diff --git a/.bazelrc b/.bazelrc index 14677de162fc..e854f5a23623 100644 --- a/.bazelrc +++ b/.bazelrc @@ -125,6 +125,10 @@ build --config=short_logs # TODO(mihaimaruseac): Document this option or remove if no longer needed build --config=v2 +# Precompiling results in some action conflicts. Disable it for now until +# the problematic targets are fixed. +build --@rules_python//python/config_settings:precompile=force_disabled + # TF now has `cc_shared_library` targets, so it needs the experimental flag # TODO(rostam): Remove when `cc_shared_library` is enabled by default common --experimental_cc_shared_library @@ -159,15 +163,19 @@ build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain build:android_arm --config=android build:android_arm --cpu=armeabi-v7a build:android_arm --fat_apk_cpu=armeabi-v7a +build:android_arm --platforms=@org_tensorflow//tensorflow/tools/toolchains/android:armeabi-v7a build:android_arm64 --config=android build:android_arm64 --cpu=arm64-v8a build:android_arm64 --fat_apk_cpu=arm64-v8a +build:android_arm64 --platforms=@org_tensorflow//tensorflow/tools/toolchains/android:arm64-v8a build:android_x86 --config=android build:android_x86 --cpu=x86 build:android_x86 --fat_apk_cpu=x86 +build:android_x86 --platforms=@org_tensorflow//tensorflow/tools/toolchains/android:x86 build:android_x86_64 --config=android build:android_x86_64 --cpu=x86_64 build:android_x86_64 --fat_apk_cpu=x86_64 +build:android_x86_64 --platforms=@org_tensorflow//tensorflow/tools/toolchains/android:x86_64 # Build everything statically for Android since all static libs are later # bundled together into a single .so for deployment. @@ -200,6 +208,7 @@ build:apple-toolchain --host_crosstool_top=@local_config_apple_cc//:toolchain # Settings for MacOS on ARM CPUs. build:macos_arm64 --cpu=darwin_arm64 build:macos_arm64 --macos_minimum_os=11.0 +build:macos_arm64 --platforms=@build_bazel_apple_support//configs/platforms:darwin_arm64 # iOS configs for each architecture and the fat binary builds. build:ios --apple_platform_type=ios @@ -208,14 +217,19 @@ build:ios --copt=-Wno-c++11-narrowing build:ios --config=apple-toolchain build:ios_armv7 --config=ios build:ios_armv7 --cpu=ios_armv7 +build:ios_armv7 --platforms=@org_tensorflow//tensorflow/tools/toolchains/ios:ios_armv7 build:ios_arm64 --config=ios build:ios_arm64 --cpu=ios_arm64 +build:ios_arm64 --platforms=@build_bazel_apple_support//configs/platforms:ios_arm64 build:ios_arm64e --config=ios build:ios_arm64e --cpu=ios_arm64e +build:ios_arm64e --platforms=@build_bazel_apple_support//configs/platforms:ios_arm64e build:ios_sim_arm64 --config=ios build:ios_sim_arm64 --cpu=ios_sim_arm64 +build:ios_sim_arm64 --platforms=@build_bazel_apple_support//configs/platforms:ios_sim_arm64 build:ios_x86_64 --config=ios build:ios_x86_64 --cpu=ios_x86_64 +build:ios_x86_64 --platforms=@build_bazel_apple_support//configs/platforms:ios_x86_64 build:ios_fat --config=ios build:ios_fat --ios_multi_cpus=armv7,arm64,i386,x86_64 @@ -241,24 +255,24 @@ build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0 build:mkl_threadpool --define=build_with_mkl_opensource=true build:mkl_threadpool -c opt -# Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL). -build:mkl_aarch64 --define=build_with_mkl_aarch64=true -build:mkl_aarch64 --define=build_with_openmp=true -build:mkl_aarch64 --define=build_with_acl=true -build:mkl_aarch64 -c opt - # Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL). 
# with Eigen threadpool support build:mkl_aarch64_threadpool --define=build_with_mkl_aarch64=true +build:mkl_aarch64_threadpool --define=build_with_acl=true build:mkl_aarch64_threadpool -c opt +# This is an alias for the mkl_aarch64_threadpool build. +build:mkl_aarch64 --config=mkl_aarch64_threadpool + +# Default CUDA and CUDNN versions. +build:cuda_version --repo_env=HERMETIC_CUDA_VERSION="12.5.1" +build:cuda_version --repo_env=HERMETIC_CUDNN_VERSION="9.3.0" + # CUDA: This config refers to building CUDA op kernels with nvcc. build:cuda --repo_env TF_NEED_CUDA=1 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain build:cuda --@local_config_cuda//:enable_cuda -# Default CUDA and CUDNN versions. -build:cuda --repo_env=HERMETIC_CUDA_VERSION="12.5.1" -build:cuda --repo_env=HERMETIC_CUDNN_VERSION="9.3.0" +build:cuda --config=cuda_version # This flag is needed to include CUDA libraries. build:cuda --@local_config_cuda//cuda:include_cuda_libs=true @@ -288,8 +302,7 @@ build:cuda_clang --linkopt="-lm" # Set up compilation CUDA version and paths and use the CUDA Clang toolchain. build:cuda_clang_official --config=cuda_clang -build:cuda_clang_official --repo_env=HERMETIC_CUDA_VERSION="12.5.1" -build:cuda_clang_official --repo_env=HERMETIC_CUDNN_VERSION="9.3.0" +build:cuda_clang_official --config=cuda_version build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-18/bin/clang" build:cuda_clang_official --crosstool_top="@local_config_cuda//crosstool:toolchain" @@ -426,12 +439,8 @@ build:windows --dynamic_mode=off # Default paths for TF_SYSTEM_LIBS build:linux --define=PREFIX=/usr -build:linux --define=LIBDIR=$(PREFIX)/lib -build:linux --define=INCLUDEDIR=$(PREFIX)/include build:linux --define=PROTOBUF_INCLUDE_PATH=$(PREFIX)/include build:macos --define=PREFIX=/usr -build:macos --define=LIBDIR=$(PREFIX)/lib -build:macos --define=INCLUDEDIR=$(PREFIX)/include build:macos --define=PROTOBUF_INCLUDE_PATH=$(PREFIX)/include # TF_SYSTEM_LIBS do not work on windows. 
@@ -494,20 +503,31 @@ build:avx_linux --copt=-mavx build:avx_linux --host_copt=-mavx build:avx_win --copt=/arch:AVX +build:win_clang_base --@com_google_protobuf//:use_dlls=True +build:win_clang_base --@com_google_absl//absl:use_dlls +build:win_clang_base --linkopt=/demangle:no --host_linkopt=/demangle:no +build:win_clang_base --linkopt=/errorlimit:0 --host_linkopt=/errorlimit:0 +build:win_clang_base --copt=/clang:-Weverything +build:win_clang_base --host_copt=/clang:-Weverything +build:win_clang_base --compiler=clang-cl +build:win_clang_base --linkopt=/FORCE:MULTIPLE +build:win_clang_base --host_linkopt=/FORCE:MULTIPLE +build:win_clang_base --action_env=PATHEXT=.COM;.EXE;.BAT;.CMD;.VBS;.VBE;.JS;.JSE;.WSF;.WSH;.MSC;.PY;.PYW +test:win_clang_base --linkopt=/FORCE:MULTIPLE +test:win_clang_base --host_linkopt=/FORCE:MULTIPLE +test:win_clang_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true --test_summary=short + +build:win_clang --config=win_clang_base +build:win_clang --extra_toolchains=@local_config_cc//:cc-toolchain-x64_windows-clang-cl +build:win_clang --extra_execution_platforms=//tensorflow/tools/toolchains/win:x64_windows-clang-cl +build:win_clang --host_platform=//tensorflow/tools/toolchains/win:x64_windows-clang-cl + +build:windows_x86_cpu_2022 --config=win_clang_base build:windows_x86_cpu_2022 --crosstool_top="//tensorflow/tools/toolchains/win2022/20241118:toolchain" build:windows_x86_cpu_2022 --extra_toolchains="//tensorflow/tools/toolchains/win2022/20241118:cc-toolchain-x64_windows-clang-cl" build:windows_x86_cpu_2022 --extra_execution_platforms="//tensorflow/tools/toolchains/win2022:windows_ltsc2022_clang" build:windows_x86_cpu_2022 --host_platform="//tensorflow/tools/toolchains/win2022:windows_ltsc2022_clang" build:windows_x86_cpu_2022 --platforms="//tensorflow/tools/toolchains/win2022:windows_ltsc2022_clang" -build:windows_x86_cpu_2022 --copt=/clang:-Weverything -build:windows_x86_cpu_2022 --host_copt=/clang:-Weverything -build:windows_x86_cpu_2022 --compiler=clang-cl -build:windows_x86_cpu_2022 --linkopt=/FORCE:MULTIPLE -build:windows_x86_cpu_2022 --host_linkopt=/FORCE:MULTIPLE -test:windows_x86_cpu_2022 --linkopt=/FORCE:MULTIPLE -test:windows_x86_cpu_2022 --host_linkopt=/FORCE:MULTIPLE -test:windows_x86_cpu_2022 --action_env=PATHEXT=.COM;.EXE;.BAT;.CMD;.VBS;.VBE;.JS;.JSE;.WSF;.WSH;.MSC;.PY;.PYW -test:windows_x86_cpu_2022 --build_tests_only --keep_going --test_output=errors --verbose_failures=true --test_summary=short # Options to build TensorFlow 1.x or 2.x. # TODO(kanglan): Change v2's define to default behavior @@ -581,6 +601,12 @@ build:rbe_linux_cpu --python_path="/usr/bin/python3" # These you may need to change for your own GCP project. common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instances/default_instance +# Download CUDA/CUDNN redistributions to preserve the repositories cache between +# CPU and GPU builds. +# TODO(ybaturina): Uncomment when RBE is ready to support this. +# build:rbe_linux_cpu --repo_env USE_CUDA_REDISTRIBUTIONS=1 +# build:rbe_linux_cpu --config=cuda_version + # TODO(kanglan): Remove it after toolchain update is complete. 
build:rbe_linux_cpu_old --config=rbe_linux build:rbe_linux_cpu_old --host_crosstool_top="@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain" @@ -594,6 +620,7 @@ common:rbe_linux_cpu_old --remote_instance_name=projects/tensorflow-testing/inst build:rbe_linux_cuda --config=cuda_clang_official build:rbe_linux_cuda --config=rbe_linux_cpu +build:rbe_linux_cuda --repo_env=USE_CUDA_TAR_ARCHIVE_FILES=1 # For Remote build execution -- GPU configuration build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1 @@ -621,8 +648,10 @@ build:elinux --crosstool_top=@local_config_embedded_arm//:toolchain build:elinux --host_crosstool_top=@bazel_tools//tools/cpp:toolchain build:elinux_aarch64 --config=elinux build:elinux_aarch64 --cpu=aarch64 +build:elinux_aarch64 --platforms=@org_tensorflow//tensorflow/tools/toolchains/linux:linux_aarch64 build:elinux_armhf --config=elinux build:elinux_armhf --cpu=armhf +build:elinux_armhf --platforms=@org_tensorflow//tensorflow/tools/toolchains/linux:linux_armhf build:elinux_armhf --copt -mfp16-format=ieee # Config-specific options should come above this line. @@ -766,11 +795,6 @@ build:tf_public_macos_cache_push --config=tf_public_macos_cache --remote_upload_ # These are convenience config options that effectively declare TF's CI test suites. Look # at the scripts of ci/official/ to see how TF's CI uses them. -# LIBTENSORFLOW TESTS are for building Libtensorflow archives. These are CUDA/CPU-agnostic. -test:linux_libtensorflow_test --config=cuda_wheel -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test -build:linux_libtensorflow_build --config=cuda_wheel -- //tensorflow/tools/lib_package:libtensorflow.tar.gz //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz //tensorflow/java:libtensorflow.jar //tensorflow/java:libtensorflow-src.jar //tensorflow/tools/lib_package:libtensorflow_proto.zip -build:windows_libtensorflow_build --config=cuda_wheel --config=windows_x86_cpu_2022 -- //:LICENSE //tensorflow:tensorflow.dll //tensorflow:tensorflow_dll_import_lib //tensorflow/tools/lib_package:clicenses_generate //tensorflow/java:tensorflow_jni.dll //tensorflow/tools/lib_package:jnilicenses_generate - # PYTHON TESTS run a suite of Python tests intended for verifying that the Python wheel # will work properly. These are usually run Nightly or upon Release. # CPU WHEEL @@ -802,7 +826,7 @@ test:macos_x86_wheel_test --@local_xla//third_party/py:wheel_dependency=true --c test:windows_x86_cpu_2022_wheel_test_filters --test_tag_filters=-no_windows,-windows_excluded,-no_oss,-oss_excluded,-gpu,-tpu,-benchmark-test,-v1only test:windows_x86_cpu_2022_wheel_test_filters --build_tag_filters=-no_windows,-windows_excluded,-no_oss,-oss_excluded,-benchmark-test,-v1only test:windows_x86_cpu_2022_wheel_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium --test_timeout="300,450,1200,3600" -test:windows_x86_cpu_2022_wheel_test --build_tests_only --config=windows_x86_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/java/... -//tensorflow/lite/... -//tensorflow/compiler/... +test:windows_x86_cpu_2022_wheel_test --build_tests_only --config=windows_x86_cpu_pycpp_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:prebuilt_wheel_import_api_packages_test_cpu -//tensorflow/java/... -//tensorflow/lite/... -//tensorflow/compiler/... # PYCPP TESTS run a suite of Python and C++ tests to verify general correctness over # the whole TF code base. 
These are usually run continuously or upon presubmit. @@ -853,12 +877,11 @@ build:cross_compile_macos_x86_pycpp_test --config=macos_x86_pycpp_test build:cross_compile_macos_x86_pycpp_test -//tensorflow/core/kernels:quantized_conv_ops_test -//tensorflow/core/kernels:quantized_matmul_op_test -//tensorflow/python/ops:quantized_conv_ops_test -//tensorflow/tools/graph_transforms:transforms_test -//tensorflow/python/tools:aot_compiled_test # WINDOWS X86-64 CPU PYCPP build:windows_x86_cpu_2022_pycpp_test_build_opts --copt=/d2ReducedOptimizeHugeFunctions --host_copt=/d2ReducedOptimizeHugeFunctions --dynamic_mode=off -build:windows_x86_cpu_2022_pycpp_test_build_opts_debug --config=windows_x86_cpu_2022_pycpp_test_build_opts --linkopt=/demangle:no --host_linkopt=/demangle:no --linkopt=/errorlimit:0 --host_linkopt=/errorlimit:0 test:windows_x86_cpu_2022_pycpp_test_filters --test_tag_filters=-no_windows,-windows_excluded,-no_oss,-tf_tosa,-oss_excluded,-gpu,-tpu,-benchmark-test,-v1only -test:windows_x86_cpu_2022_pycpp_test_filters --build_tag_filters=-no_windows,-windows_excluded,-no_oss,-tf_tosa,-oss_excluded,-benchmark-test,-v1only +build:windows_x86_cpu_2022_pycpp_test_filters --build_tag_filters=-no_windows,-windows_excluded,-no_oss,-tf_tosa,-oss_excluded,-benchmark-test,-v1only test:windows_x86_cpu_2022_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium --test_timeout="300,450,1200,3600" test:windows_x86_cpu_2022_pycpp_test_opts --config=windows_x86_cpu_2022_pycpp_test_build_opts --build_tests_only -test:windows_x86_cpu_2022_pycpp_test --config=windows_x86_cpu_2022_pycpp_test_opts --config=windows_x86_cpu_2022_pycpp_test_filters -- //tensorflow/... -//tensorflow/java/... -//tensorflow/lite/... -//tensorflow/compiler/... +test:windows_x86_cpu_2022_pycpp_test --config=windows_x86_cpu_2022_pycpp_test_opts --config=windows_x86_cpu_2022_pycpp_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:import_api_packages_test_cpu -//tensorflow/java/... -//tensorflow/lite/... -//tensorflow/compiler/... 
# END TF TEST SUITE OPTIONS diff --git a/.github/workflows/osv-scanner-scheduled.yml b/.github/workflows/osv-scanner-scheduled.yml index e612b642fb19..cb30fca91ecb 100644 --- a/.github/workflows/osv-scanner-scheduled.yml +++ b/.github/workflows/osv-scanner-scheduled.yml @@ -28,7 +28,7 @@ permissions: jobs: scan-scheduled: if: github.repository == 'tensorflow/tensorflow' - uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v1.9.2" + uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v2.0.1" with: scan-args: |- --lockfile=requirements.txt:./requirements_lock_3_9.txt diff --git a/.github/workflows/pylint-presubmit.yml b/.github/workflows/pylint-presubmit.yml index 09801d29b697..b2113a0e0448 100644 --- a/.github/workflows/pylint-presubmit.yml +++ b/.github/workflows/pylint-presubmit.yml @@ -38,7 +38,7 @@ jobs: run: | echo Changed files: ${{ steps.get_file_changes.outputs.files }} - name: Set up Python 3.9 - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: "3.9" - name: Install Python dependencies diff --git a/.github/workflows/release-branch-cherrypick.yml b/.github/workflows/release-branch-cherrypick.yml index 6587769b85b8..4fa4f8d5b943 100644 --- a/.github/workflows/release-branch-cherrypick.yml +++ b/.github/workflows/release-branch-cherrypick.yml @@ -58,7 +58,7 @@ jobs: echo "SHORTSHA=$(git log -1 ${{ github.event.inputs.git_commit }} --format="%h")" >> "$GITHUB_OUTPUT" echo "TITLE=$(git log -1 ${{ github.event.inputs.git_commit }} --format="%s")" >> "$GITHUB_OUTPUT" - name: Create Pull Request with changes - uses: peter-evans/create-pull-request@dd2324fc52d5d43c699a5636bcf19fceaa70c284 # v7.0.7 + uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8 with: title: '${{ github.event.inputs.release_branch }} cherry-pick: ${{ steps.cherrypick.outputs.SHORTSHA }} "${{ steps.cherrypick.outputs.TITLE }}"' committer: TensorFlow Release Automation diff --git a/.github/workflows/scorecards-analysis.yml b/.github/workflows/scorecards-analysis.yml index 6adc36c3749d..51fe91c6b86b 100644 --- a/.github/workflows/scorecards-analysis.yml +++ b/.github/workflows/scorecards-analysis.yml @@ -55,7 +55,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: SARIF file path: results.sarif @@ -64,6 +64,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard (optional). # Commenting out will disable upload of results to your repo's Code Scanning dashboard - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@b56ba49b26e50535fa1e7f7db0f4f7b4bf65d80d # v3.28.10 + uses: github/codeql-action/upload-sarif@28deaeda66b76a05916b6923827895f2b14ab387 # v3.28.16 with: sarif_file: results.sarif diff --git a/.github/workflows/sigbuild-docker-branch.yml b/.github/workflows/sigbuild-docker-branch.yml deleted file mode 100644 index 35086f5d073e..000000000000 --- a/.github/workflows/sigbuild-docker-branch.yml +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -name: Upload SIG Build docker containers modified for release branches - -on: - workflow_dispatch: - push: - paths: - - '.github/workflows/sigbuild-docker-branch.yml' - - 'tensorflow/tools/tf_sig_build_dockerfiles/**' - - '!tensorflow/tools/tf_sig_build_dockerfiles/README.md' - branches: - - "r[1-9].[0-9]+" - -permissions: - contents: read - -jobs: - docker: - if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [python3.9, python3.10, python3.11, python3.12] - steps: - - name: Delete unnecessary tools folder - run: rm -rf /opt/hostedtoolcache - - - name: Checkout - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 - - - name: Login to DockerHub - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Login to GCR - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0 - with: - registry: gcr.io - username: _json_key - password: ${{ secrets.GCP_CREDS }} - - - name: Generate variables for cache busting and tag naming - run: | - echo "DATE=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT" - # Converts r2.9 to just 2.9 - echo "REF=$(echo $GITHUB_REF_NAME | sed 's/r//g')" >> "$GITHUB_OUTPUT" - id: vars - - - name: Build and push - id: docker_build - uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0 - with: - push: true - context: ./tensorflow/tools/tf_sig_build_dockerfiles - target: devel - build-args: | - PYTHON_VERSION=${{ matrix.python-version }} - CACHEBUSTER=${{ steps.vars.outputs.DATE }} - tags: | - tensorflow/build:${{ steps.vars.outputs.REF }}-${{ matrix.python-version }} - gcr.io/tensorflow-sigs/build:${{ steps.vars.outputs.REF }}-${{ matrix.python-version }} - cache-from: type=registry,ref=tensorflow/build:${{ steps.vars.outputs.REF }}-${{ matrix.python-version }} - cache-to: type=inline - - - name: Image digest - run: echo ${{ steps.docker_build.outputs.digest }} - diff --git a/.github/workflows/sigbuild-docker-presubmit.yml b/.github/workflows/sigbuild-docker-presubmit.yml deleted file mode 100644 index 3a30dd849d23..000000000000 --- a/.github/workflows/sigbuild-docker-presubmit.yml +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -name: Build SIG Build containers as presubmits - -on: - pull_request: - types: [labeled, opened, synchronize, reopened] - paths: - - '.github/workflows/sigbuild-docker-presubmit.yml' - - 'tensorflow/tools/tf_sig_build_dockerfiles/**' - - '!tensorflow/tools/tf_sig_build_dockerfiles/README.md' - -permissions: - contents: read - -jobs: - docker: - if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [python3.9, python3.10, python3.11, python3.12] - permissions: - contents: read - pull-requests: write - steps: - - name: Delete unnecessary tools folder - run: | - df -h - rm -rf /opt/hostedtoolcache - df -h - - - name: Checkout - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 - - - name: Login to GCR - if: contains(github.event.pull_request.labels.*.name, 'build and push to gcr.io for staging') - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0 - with: - registry: gcr.io - username: _json_key - password: ${{ secrets.GCP_CREDS }} - - - name: Login to AR - # Once this is verified, change the label's name. For now, we will piggyback on gcr.io actions. 
- if: contains(github.event.pull_request.labels.*.name, 'build and push to gcr.io for staging') - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0 - with: - registry: us-central1-docker.pkg.dev - username: _json_key - password: ${{ secrets.GCP_CREDS }} - - - name: Grab the date to do cache busting (assumes same day OK to keep) - run: | - echo "DATE=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT" - id: date - - - name: Build containers, and push to GCR only if the 'build and push to gcr.io for staging' label is applied - id: docker_build - uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0 - with: - push: ${{ contains(github.event.pull_request.labels.*.name, 'build and push to gcr.io for staging') }} - context: ./tensorflow/tools/tf_sig_build_dockerfiles - target: devel - build-args: | - PYTHON_VERSION=${{ matrix.python-version }} - CACHEBUSTER=${{ steps.date.outputs.DATE }} - tags: | - gcr.io/tensorflow-sigs/build:${{ github.event.number }}-${{ matrix.python-version }} - us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/build:${{ github.event.number }}-${{ matrix.python-version }} - cache-from: | - type=registry,ref=tensorflow/build:latest-${{ matrix.python-version }} - type=registry,ref=gcr.io/tensorflow-sigs/build:${{ github.event.number }}-${{ matrix.python-version }} - cache-to: type=inline - - - name: Add a comment with the pushed containers - uses: mshick/add-pr-comment@dd126dd8c253650d181ad9538d8b4fa218fc31e8 # v2 - if: contains(github.event.pull_request.labels.*.name, 'build and push to gcr.io for staging') - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - message: | - I pushed these containers: - - - `gcr.io/tensorflow-sigs/build:${{ github.event.number }}-python3.12` - - `gcr.io/tensorflow-sigs/build:${{ github.event.number }}-python3.11` - - `gcr.io/tensorflow-sigs/build:${{ github.event.number }}-python3.10` - - `gcr.io/tensorflow-sigs/build:${{ github.event.number }}-python3.9` - - Re-apply the `build and push to gcr.io for staging` label to rebuild and push again. This comment will only be posted once. - - - name: Print image digest - run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/sigbuild-docker.yml b/.github/workflows/sigbuild-docker.yml deleted file mode 100644 index 3b1026abfc69..000000000000 --- a/.github/workflows/sigbuild-docker.yml +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -name: Upload SIG Build docker containers regularly - -on: - workflow_dispatch: - schedule: - # Run once a week on Sunday at midnight. 
See http://crontab.guru - - cron: '0 0 * * 0' - push: - paths: - - '.github/workflows/sigbuild-docker.yml' - - 'tensorflow/tools/tf_sig_build_dockerfiles/**' - - '!tensorflow/tools/tf_sig_build_dockerfiles/README.md' - branches: - - master - -permissions: - contents: read - -jobs: - docker: - if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [python3.9, python3.10, python3.11, python3.12] - steps: - - name: Delete unnecessary tools folder - run: rm -rf /opt/hostedtoolcache - - - name: Checkout - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 - - - name: Login to DockerHub - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Login to GCR - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0 - with: - registry: gcr.io - username: _json_key - password: ${{ secrets.GCP_CREDS }} - - - name: Login to AR - # Once this is verified, removed gcr.io actions. - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0 - with: - registry: us-central1-docker.pkg.dev - username: _json_key - password: ${{ secrets.GCP_CREDS }} - - - name: Grab the upcoming TF version to tag this container - run: | - # [[:digit:]] searches for numbers and \+ joins them together - major_version=$(grep "^#define TF_MAJOR_VERSION" ./tensorflow/core/public/version.h | grep -o "[[:digit:]]\+") - minor_version=$(grep "^#define TF_MINOR_VERSION" ./tensorflow/core/public/version.h | grep -o "[[:digit:]]\+") - echo "TF_VERSION=${major_version}.${minor_version}" >> "$GITHUB_OUTPUT" - # Also get the current date to do cache busting. 
Assumes one day - # is an ok range for rebuilds - echo "DATE=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT" - id: tf-version - - - name: Build and push - id: docker_build - uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0 - with: - push: true - context: ./tensorflow/tools/tf_sig_build_dockerfiles - target: devel - build-args: | - PYTHON_VERSION=${{ matrix.python-version }} - CACHEBUSTER=${{ steps.tf-version.outputs.DATE }} - tags: | - tensorflow/build:latest-${{ matrix.python-version }} - tensorflow/build:${{ steps.tf-version.outputs.TF_VERSION }}-${{ matrix.python-version }} - gcr.io/tensorflow-sigs/build:latest-${{ matrix.python-version }} - gcr.io/tensorflow-sigs/build:${{ steps.tf-version.outputs.TF_VERSION }}-${{ matrix.python-version }} - us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/build:latest-${{ matrix.python-version }} - us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/build:${{ steps.tf-version.outputs.TF_VERSION }}-${{ matrix.python-version }} - cache-from: type=registry,ref=tensorflow/build:latest-${{ matrix.python-version }} - cache-to: type=inline - - - name: Image digest - run: echo ${{ steps.docker_build.outputs.digest }} - diff --git a/.github/workflows/update-rbe.yml b/.github/workflows/update-rbe.yml index 11b83f43e708..a06d2e0125f6 100644 --- a/.github/workflows/update-rbe.yml +++ b/.github/workflows/update-rbe.yml @@ -130,7 +130,7 @@ jobs: map sigbuild-r2.17-clang-python3.11 2.17-python3.11 map sigbuild-r2.17-clang-python3.12 2.17-python3.12 - name: Create Pull Request with changes - uses: peter-evans/create-pull-request@dd2324fc52d5d43c699a5636bcf19fceaa70c284 # v7.0.7 + uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8 with: title: Update the RBE images to the latest container versions committer: TensorFlow Release Automation diff --git a/README.md b/README.md index 64060ee986f9..f8e1c796cc44 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,6 @@ [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/tensorflow-py.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:tensorflow-py) [![OSSRank](https://shields.io/endpoint?url=https://ossrank.com/shield/44)](https://ossrank.com/p/44) [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-v1.4%20adopted-ff69b4.svg)](CODE_OF_CONDUCT.md) -[![TF Official Continuous](https://tensorflow.github.io/build/TF%20Official%20Continuous.svg)](https://tensorflow.github.io/build#TF%20Official%20Continuous) -[![TF Official Nightly](https://tensorflow.github.io/build/TF%20Official%20Nightly.svg)](https://tensorflow.github.io/build#TF%20Official%20Nightly) **`Documentation`** | ------------------- | @@ -71,7 +69,7 @@ commands. *Nightly binaries are available for testing using the [tf-nightly](https://pypi.python.org/pypi/tf-nightly) and -[tf-nightly-cpu](https://pypi.python.org/pypi/tf-nightly-cpu) packages on PyPi.* +[tf-nightly-cpu](https://pypi.python.org/pypi/tf-nightly-cpu) packages on PyPI.* #### *Try your first TensorFlow program* diff --git a/RELEASE.md b/RELEASE.md index cea0dc4c8779..a867cae331b6 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -3234,7 +3234,7 @@ This release introduces several vulnerability fixes: * Keras been split into a separate PIP package (`keras`), and its code has been moved to the GitHub - repository[keras-team/keras](http://github.com/keras-team/keras). The + repository[keras-team/keras](https://github.com/keras-team/keras). 
The API endpoints for `tf.keras` stay unchanged, but are now backed by the `keras` PIP package. The existing code in tensorflow/python/keras is a staled copy and will be removed in future release (2.7). Please remove @@ -10260,7 +10260,7 @@ answered questions, and were part of inspiring discussions. ## Major Features And Improvements * `tf.keras` is now part of the core TensorFlow API. -* [`tf.data`](http://tensorflow.org/guide/data) is now part of the core +* [`tf.data`](https://tensorflow.org/guide/data) is now part of the core TensorFlow API. * The API is now subject to backwards compatibility guarantees. * For a guide to migrating from the `tf.contrib.data` API, see the diff --git a/WORKSPACE b/WORKSPACE index 445f974b0943..e42663c69229 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -43,6 +43,7 @@ python_init_repositories( "3.10": "//:requirements_lock_3_10.txt", "3.11": "//:requirements_lock_3_11.txt", "3.12": "//:requirements_lock_3_12.txt", + "3.13": "//:requirements_lock_3_13.txt", }, ) diff --git a/ci/official/containers/linux_arm64/Dockerfile b/ci/official/containers/linux_arm64/Dockerfile index c66ef9682c49..2092c4986ea3 100644 --- a/ci/official/containers/linux_arm64/Dockerfile +++ b/ci/official/containers/linux_arm64/Dockerfile @@ -1,5 +1,5 @@ ################################################################################ -FROM ubuntu:20.04@sha256:8e5c4f0285ecbb4ead070431d29b576a530d3166df73ec44affc1cd27555141b as builder +FROM ubuntu:20.04@sha256:8feb4d8ca5354def3d8fce243717141ce31e2c428701f6682bd2fafe15388214 as builder ################################################################################ # Install devtoolset build dependencies diff --git a/ci/official/containers/linux_arm64/devel.usertools/code_check_full.bats b/ci/official/containers/linux_arm64/devel.usertools/code_check_full.bats index cdfc81499af7..ae9d1919039b 100644 --- a/ci/official/containers/linux_arm64/devel.usertools/code_check_full.bats +++ b/ci/official/containers/linux_arm64/devel.usertools/code_check_full.bats @@ -57,8 +57,8 @@ EOF # grep patterns for targets which are allowed to be extra licenses cat > $BATS_TEST_TMPDIR/allowed_to_be_extra <pythons.txt <pythons.txt < requirements_without_twine.txt REQUIREMENTS=requirements_without_twine.txt fi diff --git a/ci/official/containers/ml_build_arm64/requirements.txt b/ci/official/containers/ml_build_arm64/requirements.txt index 0487ecd6260c..6ae6deda1412 100644 --- a/ci/official/containers/ml_build_arm64/requirements.txt +++ b/ci/official/containers/ml_build_arm64/requirements.txt @@ -1,7 +1,7 @@ portpicker==1.6.0 # For wheel verification, and uploading auditwheel ~= 6.1.0 -twine ~= 5.1.1 +twine ~= 6.1.0 # uv is faster than pip for installing Python packages. uv ~= 0.5.30 \ No newline at end of file diff --git a/ci/official/debug_tfci.sh b/ci/official/debug_tfci.sh index 249820383358..08ffa240ee34 100755 --- a/ci/official/debug_tfci.sh +++ b/ci/official/debug_tfci.sh @@ -22,3 +22,4 @@ echo "==TFCI== env outside of tfrun:" env echo "==TFCI== env inside of tfrun:" tfrun env +echo "==TFCI== env end" diff --git a/ci/official/envs/linux_arm64 b/ci/official/envs/linux_arm64 index 8e385aab7be9..2b6e38b0e42f 100644 --- a/ci/official/envs/linux_arm64 +++ b/ci/official/envs/linux_arm64 @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -TFCI_BAZEL_COMMON_ARGS="--repo_env=HERMETIC_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config release_arm64_linux" +TFCI_BAZEL_COMMON_ARGS="--repo_env=HERMETIC_PYTHON_VERSION=$TFCI_PYTHON_VERSION --repo_env=USE_PYWRAP_RULES=True --config release_arm64_linux" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64 # Note: this is not set to "--cpu", because that changes the package name # to tensorflow_cpu. These ARM builds are supposed to have the name "tensorflow" @@ -28,5 +28,5 @@ TFCI_OUTPUT_DIR=build_output TFCI_WHL_AUDIT_ENABLE=1 TFCI_WHL_AUDIT_PLAT=manylinux2014_aarch64 TFCI_WHL_BAZEL_TEST_ENABLE=1 -TFCI_WHL_SIZE_LIMIT=250M +TFCI_WHL_SIZE_LIMIT=255M TFCI_WHL_SIZE_LIMIT_ENABLE=1 diff --git a/ci/official/envs/linux_arm64_cross_compile b/ci/official/envs/linux_arm64_cross_compile index e4e9004b4f1c..7333be2ff9ff 100644 --- a/ci/official/envs/linux_arm64_cross_compile +++ b/ci/official/envs/linux_arm64_cross_compile @@ -13,5 +13,5 @@ # limitations under the License. # ============================================================================== source ci/official/envs/linux_arm64 -TFCI_BAZEL_COMMON_ARGS="--repo_env=HERMETIC_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config cross_compile_linux_arm64" +TFCI_BAZEL_COMMON_ARGS="--repo_env=HERMETIC_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config cross_compile_linux_arm64 --repo_env=USE_PYWRAP_RULES=True" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=cross_compile_linux_arm64 diff --git a/ci/official/envs/macos_arm64 b/ci/official/envs/macos_arm64 index c789a2dc2d09..96d8c14655ce 100644 --- a/ci/official/envs/macos_arm64 +++ b/ci/official/envs/macos_arm64 @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -TFCI_BAZEL_COMMON_ARGS="--repo_env=HERMETIC_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config release_macos_arm64" +TFCI_BAZEL_COMMON_ARGS="--repo_env=HERMETIC_PYTHON_VERSION=$TFCI_PYTHON_VERSION --repo_env=USE_PYWRAP_RULES=True --config release_macos_arm64" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=macos_arm64 TFCI_BUILD_PIP_PACKAGE_WHEEL_NAME_ARG="--repo_env=WHEEL_NAME=tensorflow" TFCI_INDEX_HTML_ENABLE=1 @@ -29,7 +29,12 @@ case $TFCI_PYTHON_VERSION in 3.11) TFCI_MACOS_PYENV_INSTALL_ENABLE=0 ;; +3.13) + TFCI_MACOS_UPGRADE_PYENV_ENABLE=1 + TFCI_MACOS_PYENV_INSTALL_ENABLE=1 + ;; *) TFCI_MACOS_PYENV_INSTALL_ENABLE=1 ;; esac + diff --git a/ci/official/envs/py313 b/ci/official/envs/py313 new file mode 100644 index 000000000000..1210c5eca815 --- /dev/null +++ b/ci/official/envs/py313 @@ -0,0 +1,15 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +TFCI_PYTHON_VERSION=3.13 diff --git a/ci/official/envs/windows_x86_2022 b/ci/official/envs/windows_x86_2022 index 5d3bd33e05da..56187ad78eca 100644 --- a/ci/official/envs/windows_x86_2022 +++ b/ci/official/envs/windows_x86_2022 @@ -16,7 +16,7 @@ TFCI_DOCKER_ENABLE=1 TFCI_DOCKER_PULL_ENABLE=1 TFCI_DOCKER_IMAGE="gcr.io/tensorflow-testing/tf-win2022@sha256:915cb093630432c38b028f56bd31116a5559ebbc688d427b6092d86828ae03bc" TFCI_BAZEL_BAZELRC_ARGS="--output_user_root=C:/t" -TFCI_BAZEL_COMMON_ARGS="--repo_env=HERMETIC_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config=windows_x86_cpu_2022" +TFCI_BAZEL_COMMON_ARGS="--repo_env=HERMETIC_PYTHON_VERSION=$TFCI_PYTHON_VERSION --repo_env=USE_PYWRAP_RULES=True --config=windows_x86_cpu_2022" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=windows_x86_cpu_2022 TFCI_BUILD_PIP_PACKAGE_WHEEL_NAME_ARG="--repo_env=WHEEL_NAME=tensorflow" TFCI_BUILD_PIP_PACKAGE_ADDITIONAL_WHEEL_NAMES="tensorflow_cpu" diff --git a/ci/official/requirements_updater/requirements.in b/ci/official/requirements_updater/requirements.in index 0cfbaf22f820..f63fa5ccc529 100644 --- a/ci/official/requirements_updater/requirements.in +++ b/ci/official/requirements_updater/requirements.in @@ -28,7 +28,7 @@ requests >= 2.31.0 packaging==23.2 setuptools==70.0.0 jax==0.4.7 -zstandard=0.23.0 +zstandard==0.23.0 # NVIDIA CUDA dependencies # Note that the wheels are downloaded only when the targets in bazel command # contain dependencies on these wheels. @@ -44,7 +44,7 @@ nvidia-cusparse-cu12 == 12.5.1.3 nvidia-nccl-cu12 == 2.25.1 nvidia-nvjitlink-cu12 == 12.5.82 # The dependencies below are needed for TF wheel testing. -tensorflow-io-gcs-filesystem==0.37.1 +tensorflow-io-gcs-filesystem==0.37.1 ; python_version <= "3.12" libclang >= 13.0.0 google_pasta ~= 0.2 flatbuffers ~= 24.3.25 diff --git a/ci/official/utilities/code_check_full.bats b/ci/official/utilities/code_check_full.bats index 63e8667b9a1a..e468ee09d61b 100644 --- a/ci/official/utilities/code_check_full.bats +++ b/ci/official/utilities/code_check_full.bats @@ -61,8 +61,8 @@ EOF # grep patterns for targets which are allowed to be extra licenses cat > $BATS_TEST_TMPDIR/allowed_to_be_extra < /dev/null && brew list pyenv &> /dev/null; then + # On "ventura-slcn" VMs, pyenv is managed via Homebrew. + echo "pyenv is installed and managed by homebrew." + brew update && brew upgrade pyenv + else + echo "pyenv is not managed by homebrew. Installing it via github..." + # On "ventura" VMs, pyenv is not managed by Homebrew. Install the latest + # pyenv from github. 
+ rm -rf "$PYENV_ROOT" + git clone https://github.com/pyenv/pyenv.git "$PYENV_ROOT" + fi + echo "Upgraded pyenv version: $(pyenv --version)" fi # "TFCI_MACOS_PYENV_INSTALL_ENABLE" controls whether to use Pyenv to install diff --git a/configure.py b/configure.py index ec04fcfdd0cc..e5700e0b84b5 100644 --- a/configure.py +++ b/configure.py @@ -529,7 +529,9 @@ def get_from_env_or_user_or_default(environ_cp, var_name, ask_for_var, string value for var_name """ var = environ_cp.get(var_name) - if not var: + # an intentionally empty value in the + # environment is not the same as no value + if var is None: var = get_input(ask_for_var) print('\n') if not var: @@ -1125,7 +1127,7 @@ def set_system_libs_flag(environ_cp): syslibs = ','.join(sorted(syslibs.split())) write_action_env_to_bazelrc('TF_SYSTEM_LIBS', syslibs) - for varname in ('PREFIX', 'LIBDIR', 'INCLUDEDIR', 'PROTOBUF_INCLUDE_PATH'): + for varname in ('PREFIX', 'PROTOBUF_INCLUDE_PATH'): if varname in environ_cp: write_to_bazelrc('build --define=%s=%s' % (varname, environ_cp[varname])) diff --git a/requirements_lock_3_13.txt b/requirements_lock_3_13.txt new file mode 100644 index 000000000000..a03c65b0b248 --- /dev/null +++ b/requirements_lock_3_13.txt @@ -0,0 +1,842 @@ +# +# This file is autogenerated by pip-compile with Python 3.13 +# by the following command: +# +# bazel run //ci/official/requirements_updater:requirements.update +# +absl-py==2.2.1 \ + --hash=sha256:4c7bc50d42d021c12d4f31b7001167925e0bd71ade853069f64af410f5565ff9 \ + --hash=sha256:ca8209abd5005ae6e700ef36e2edc84ad5338678f95625a3f15275410a89ffbc + # via + # dm-tree + # keras-nightly + # tb-nightly +astor==0.7.1 \ + --hash=sha256:95c30d87a6c2cf89aa628b87398466840f0ad8652f88eb173125a6df8533fb8d \ + --hash=sha256:fb503b9e2fdd05609fbf557b916b4a7824171203701660f0c55bbf5a7a68713e + # via -r ci/official/requirements_updater/requirements.in +astunparse==1.6.3 \ + --hash=sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872 \ + --hash=sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8 + # via -r ci/official/requirements_updater/requirements.in +attrs==25.3.0 \ + --hash=sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3 \ + --hash=sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b + # via dm-tree +auditwheel==6.3.0 \ + --hash=sha256:05c70a234fa14c140aa6d9076135d9550962d95849911b8d5d0419a3add09f00 \ + --hash=sha256:31cbd8045d4ff6776f79bef328b5fd563e5ecc8ae82ea34b6fe5e76efe2a84eb + # via -r ci/official/requirements_updater/requirements.in +certifi==2025.1.31 \ + --hash=sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651 \ + --hash=sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe + # via requests +charset-normalizer==3.4.1 \ + --hash=sha256:0167ddc8ab6508fe81860a57dd472b2ef4060e8d378f0cc555707126830f2537 \ + --hash=sha256:01732659ba9b5b873fc117534143e4feefecf3b2078b0a6a2e925271bb6f4cfa \ + --hash=sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a \ + --hash=sha256:04432ad9479fa40ec0f387795ddad4437a2b50417c69fa275e212933519ff294 \ + --hash=sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b \ + --hash=sha256:0924e81d3d5e70f8126529951dac65c1010cdf117bb75eb02dd12339b57749dd \ + --hash=sha256:09b26ae6b1abf0d27570633b2b078a2a20419c99d66fb2823173d73f188ce601 \ + --hash=sha256:09b5e6733cbd160dcc09589227187e242a30a49ca5cefa5a7edd3f9d19ed53fd \ + --hash=sha256:0af291f4fe114be0280cdd29d533696a77b5b49cfde5467176ecab32353395c4 \ + 
--hash=sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d \ + --hash=sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2 \ + --hash=sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313 \ + --hash=sha256:234ac59ea147c59ee4da87a0c0f098e9c8d169f4dc2a159ef720f1a61bbe27cd \ + --hash=sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa \ + --hash=sha256:237bdbe6159cff53b4f24f397d43c6336c6b0b42affbe857970cefbb620911c8 \ + --hash=sha256:28bf57629c75e810b6ae989f03c0828d64d6b26a5e205535585f96093e405ed1 \ + --hash=sha256:2967f74ad52c3b98de4c3b32e1a44e32975e008a9cd2a8cc8966d6a5218c5cb2 \ + --hash=sha256:2a75d49014d118e4198bcee5ee0a6f25856b29b12dbf7cd012791f8a6cc5c496 \ + --hash=sha256:2bdfe3ac2e1bbe5b59a1a63721eb3b95fc9b6817ae4a46debbb4e11f6232428d \ + --hash=sha256:2d074908e1aecee37a7635990b2c6d504cd4766c7bc9fc86d63f9c09af3fa11b \ + --hash=sha256:2fb9bd477fdea8684f78791a6de97a953c51831ee2981f8e4f583ff3b9d9687e \ + --hash=sha256:311f30128d7d333eebd7896965bfcfbd0065f1716ec92bd5638d7748eb6f936a \ + --hash=sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4 \ + --hash=sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca \ + --hash=sha256:363e2f92b0f0174b2f8238240a1a30142e3db7b957a5dd5689b0e75fb717cc78 \ + --hash=sha256:3a3bd0dcd373514dcec91c411ddb9632c0d7d92aed7093b8c3bbb6d69ca74408 \ + --hash=sha256:3bed14e9c89dcb10e8f3a29f9ccac4955aebe93c71ae803af79265c9ca5644c5 \ + --hash=sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3 \ + --hash=sha256:44ecbf16649486d4aebafeaa7ec4c9fed8b88101f4dd612dcaf65d5e815f837f \ + --hash=sha256:4532bff1b8421fd0a320463030c7520f56a79c9024a4e88f01c537316019005a \ + --hash=sha256:49402233c892a461407c512a19435d1ce275543138294f7ef013f0b63d5d3765 \ + --hash=sha256:4c0907b1928a36d5a998d72d64d8eaa7244989f7aaaf947500d3a800c83a3fd6 \ + --hash=sha256:4d86f7aff21ee58f26dcf5ae81a9addbd914115cdebcbb2217e4f0ed8982e146 \ + --hash=sha256:5777ee0881f9499ed0f71cc82cf873d9a0ca8af166dfa0af8ec4e675b7df48e6 \ + --hash=sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9 \ + --hash=sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd \ + --hash=sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c \ + --hash=sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f \ + --hash=sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545 \ + --hash=sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176 \ + --hash=sha256:75832c08354f595c760a804588b9357d34ec00ba1c940c15e31e96d902093770 \ + --hash=sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824 \ + --hash=sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f \ + --hash=sha256:7974a0b5ecd505609e3b19742b60cee7aa2aa2fb3151bc917e6e2646d7667dcf \ + --hash=sha256:7a4f97a081603d2050bfaffdefa5b02a9ec823f8348a572e39032caa8404a487 \ + --hash=sha256:7b1bef6280950ee6c177b326508f86cad7ad4dff12454483b51d8b7d673a2c5d \ + --hash=sha256:7d053096f67cd1241601111b698f5cad775f97ab25d81567d3f59219b5f1adbd \ + --hash=sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b \ + --hash=sha256:807f52c1f798eef6cf26beb819eeb8819b1622ddfeef9d0977a8502d4db6d534 \ + --hash=sha256:80ed5e856eb7f30115aaf94e4a08114ccc8813e6ed1b5efa74f9f82e8509858f \ + --hash=sha256:8417cb1f36cc0bc7eaba8ccb0e04d55f0ee52df06df3ad55259b9a323555fc8b \ + 
--hash=sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9 \ + --hash=sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd \ + --hash=sha256:8bfa33f4f2672964266e940dd22a195989ba31669bd84629f05fab3ef4e2d125 \ + --hash=sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9 \ + --hash=sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de \ + --hash=sha256:955f8851919303c92343d2f66165294848d57e9bba6cf6e3625485a70a038d11 \ + --hash=sha256:97f68b8d6831127e4787ad15e6757232e14e12060bec17091b85eb1486b91d8d \ + --hash=sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35 \ + --hash=sha256:9f0b8b1c6d84c8034a44893aba5e767bf9c7a211e313a9605d9c617d7083829f \ + --hash=sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda \ + --hash=sha256:ab36c8eb7e454e34e60eb55ca5d241a5d18b2c6244f6827a30e451c42410b5f7 \ + --hash=sha256:b010a7a4fd316c3c484d482922d13044979e78d1861f0e0650423144c616a46a \ + --hash=sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971 \ + --hash=sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8 \ + --hash=sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41 \ + --hash=sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d \ + --hash=sha256:c0429126cf75e16c4f0ad00ee0eae4242dc652290f940152ca8c75c3a4b6ee8f \ + --hash=sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757 \ + --hash=sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a \ + --hash=sha256:c75cb2a3e389853835e84a2d8fb2b81a10645b503eca9bcb98df6b5a43eb8886 \ + --hash=sha256:c96836c97b1238e9c9e3fe90844c947d5afbf4f4c92762679acfe19927d81d77 \ + --hash=sha256:d7f50a1f8c450f3925cb367d011448c39239bb3eb4117c36a6d354794de4ce76 \ + --hash=sha256:d973f03c0cb71c5ed99037b870f2be986c3c05e63622c017ea9816881d2dd247 \ + --hash=sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85 \ + --hash=sha256:d9c3cdf5390dcd29aa8056d13e8e99526cda0305acc038b96b30352aff5ff2bb \ + --hash=sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7 \ + --hash=sha256:dccbe65bd2f7f7ec22c4ff99ed56faa1e9f785482b9bbd7c717e26fd723a1d1e \ + --hash=sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6 \ + --hash=sha256:e218488cd232553829be0664c2292d3af2eeeb94b32bea483cf79ac6a694e037 \ + --hash=sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1 \ + --hash=sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e \ + --hash=sha256:eab677309cdb30d047996b36d34caeda1dc91149e4fdca0b1a039b3f79d9a807 \ + --hash=sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407 \ + --hash=sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c \ + --hash=sha256:eea6ee1db730b3483adf394ea72f808b6e18cf3cb6454b4d86e04fa8c4327a12 \ + --hash=sha256:f08ff5e948271dc7e18a35641d2f11a4cd8dfd5634f55228b691e62b37125eb3 \ + --hash=sha256:f30bf9fd9be89ecb2360c7d94a711f00c09b976258846efe40db3d05828e8089 \ + --hash=sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd \ + --hash=sha256:fc54db6c8593ef7d4b2a331b58653356cf04f67c960f584edb7c3d8c97e8f39e \ + --hash=sha256:fd4ec41f914fa74ad1b8304bbc634b3de73d2a0889bd32076342a573e0779e00 \ + --hash=sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616 + # via requests +dill==0.3.7 \ + --hash=sha256:76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e \ + 
--hash=sha256:cc1c8b182eb3013e24bd475ff2e9295af86c1a38eb1aff128dac8962a9ce3c03 + # via -r ci/official/requirements_updater/requirements.in +dm-tree==0.1.9 \ + --hash=sha256:12f4cc6cd52a39aa38ff31577b6d79b6136a9a89273a876bf62335c9f65c27bf \ + --hash=sha256:1ae3cbff592bb3f2e197f5a8030de4a94e292e6cdd85adeea0b971d07a1b85f2 \ + --hash=sha256:2334cfe9d2ed4293f9f1c7aefba0657deaab9ea74b5fadd966f6d01d9b6b42d9 \ + --hash=sha256:294dc1cecf87552a45cdd5ddb215e7f5295a5a47c46f1f0a0463c3dd02a527d7 \ + --hash=sha256:54d5616015412311df154908069fcf2c2d8786f6088a2ae3554d186cdf2b1e15 \ + --hash=sha256:5d5b28ee2e461b6af65330c143806a6d0945dcabbb8d22d2ba863e6dabd9254e \ + --hash=sha256:6893fcdc5cf1a4f459cfc383526d35d42e7c671ae565d7e429a2f2cb2cb93e89 \ + --hash=sha256:7d7d784afaeb4b67d87d858261aaf02503939ddc1f09c4cca70728f9892ab004 \ + --hash=sha256:80c43417814b1181d3367b335460bfdd30b79ee187a64220e11f6ddd093a4b15 \ + --hash=sha256:831699d2c60a1b38776a193b7143ae0acad0a687d87654e6d3342584166816bc \ + --hash=sha256:9020a5ce256fcc83aa4bc190cc96dd66e87685db0a6e501b0c06aa492c2e38fc \ + --hash=sha256:a4c7db3d3935a5a2d5e4b383fc26c6b0cd6f78c6d4605d3e7b518800ecd5342b \ + --hash=sha256:a8d20eeab7fde77a3ed71f07716021eb0edfb4812a128eb381d108af3a310257 \ + --hash=sha256:b06e7a5da1c31a82521a60060573527e8d24b9920fdd20b2ec86f08412737598 \ + --hash=sha256:cfa33c2e028155810ad1b4e11928707bf47489516763a86e79cab2954d23bf68 \ + --hash=sha256:d05622d074353cf434049206e53c12147903a048c4bd7d77f2800d427413ad78 \ + --hash=sha256:e1f5d1e96b3a7de22b25b13a5eb30f41f8cf9c02dd4479a24920de99e780903c \ + --hash=sha256:e660d1779ddcbd1348410d08f67db4870d413a3ec4ba8b4b045bd5ce4bd8f35c \ + --hash=sha256:e97c34fcb44941c36b7ee81dcdbceba0fbe728bddcc77e5837ab2eb665bcbff8 \ + --hash=sha256:f68b0efad76703dd4648586c75618a48cdd671b68c3266fe980e323c15423607 + # via keras-nightly +flatbuffers==24.3.25 \ + --hash=sha256:8dbdec58f935f3765e4f7f3cf635ac3a77f83568138d6a2311f524ec96364812 \ + --hash=sha256:de2ec5b203f21441716617f38443e0a8ebf3d25bf0d9c0bb0ce68fa00ad546a4 + # via -r ci/official/requirements_updater/requirements.in +gast==0.4.0 \ + --hash=sha256:40feb7b8b8434785585ab224d1568b857edb18297e5a3047f1ba012bc83b42c1 \ + --hash=sha256:b7adcdd5adbebf1adf17378da5ba3f543684dbec47b1cda1f3997e573cd542c4 + # via -r ci/official/requirements_updater/requirements.in +google-pasta==0.2.0 \ + --hash=sha256:4612951da876b1a10fe3960d7226f0c7682cf901e16ac06e473b267a5afa8954 \ + --hash=sha256:b32482794a366b5366a32c92a9a9201b107821889935a02b3e51f6b432ea84ed \ + --hash=sha256:c9f2c8dfc8f96d0d5808299920721be30c9eec37f2389f28904f454565c8a16e + # via -r ci/official/requirements_updater/requirements.in +grpcio==1.71.0 \ + --hash=sha256:0ab8b2864396663a5b0b0d6d79495657ae85fa37dcb6498a2669d067c65c11ea \ + --hash=sha256:0fa05ee31a20456b13ae49ad2e5d585265f71dd19fbd9ef983c28f926d45d0a7 \ + --hash=sha256:0ff35c8d807c1c7531d3002be03221ff9ae15712b53ab46e2a0b4bb271f38537 \ + --hash=sha256:1be857615e26a86d7363e8a163fade914595c81fec962b3d514a4b1e8760467b \ + --hash=sha256:20e8f653abd5ec606be69540f57289274c9ca503ed38388481e98fa396ed0b41 \ + --hash=sha256:22c3bc8d488c039a199f7a003a38cb7635db6656fa96437a8accde8322ce2366 \ + --hash=sha256:24e867651fc67717b6f896d5f0cac0ec863a8b5fb7d6441c2ab428f52c651c6b \ + --hash=sha256:2b85f7820475ad3edec209d3d89a7909ada16caab05d3f2e08a7e8ae3200a55c \ + --hash=sha256:39983a9245d37394fd59de71e88c4b295eb510a3555e0a847d9965088cdbd033 \ + --hash=sha256:3d081e859fb1ebe176de33fc3adb26c7d46b8812f906042705346b314bde32c3 \ + 
--hash=sha256:469f42a0b410883185eab4689060a20488a1a0a00f8bbb3cbc1061197b4c5a79 \ + --hash=sha256:47be9584729534660416f6d2a3108aaeac1122f6b5bdbf9fd823e11fe6fbaa29 \ + --hash=sha256:4be74ddeeb92cc87190e0e376dbc8fc7736dbb6d3d454f2fa1f5be1dee26b9d7 \ + --hash=sha256:4dd0dfbe4d5eb1fcfec9490ca13f82b089a309dc3678e2edabc144051270a66e \ + --hash=sha256:5b08d03ace7aca7b2fadd4baf291139b4a5f058805a8327bfe9aece7253b6d67 \ + --hash=sha256:63e41b91032f298b3e973b3fa4093cbbc620c875e2da7b93e249d4728b54559a \ + --hash=sha256:652350609332de6dac4ece254e5d7e1ff834e203d6afb769601f286886f6f3a8 \ + --hash=sha256:693bc706c031aeb848849b9d1c6b63ae6bcc64057984bb91a542332b75aa4c3d \ + --hash=sha256:74258dce215cb1995083daa17b379a1a5a87d275387b7ffe137f1d5131e2cfbb \ + --hash=sha256:789d5e2a3a15419374b7b45cd680b1e83bbc1e52b9086e49308e2c0b5bbae6e3 \ + --hash=sha256:7c9c80ac6091c916db81131d50926a93ab162a7e97e4428ffc186b6e80d6dda4 \ + --hash=sha256:7d6ac9481d9d0d129224f6d5934d5832c4b1cddb96b59e7eba8416868909786a \ + --hash=sha256:85da336e3649a3d2171e82f696b5cad2c6231fdd5bad52616476235681bee5b3 \ + --hash=sha256:8700a2a57771cc43ea295296330daaddc0d93c088f0a35cc969292b6db959bf3 \ + --hash=sha256:8997d6785e93308f277884ee6899ba63baafa0dfb4729748200fcc537858a509 \ + --hash=sha256:9182e0063112e55e74ee7584769ec5a0b4f18252c35787f48738627e23a62b97 \ + --hash=sha256:9b91879d6da1605811ebc60d21ab6a7e4bae6c35f6b63a061d61eb818c8168f6 \ + --hash=sha256:a2242d6950dc892afdf9e951ed7ff89473aaf744b7d5727ad56bdaace363722b \ + --hash=sha256:a371e6b6a5379d3692cc4ea1cb92754d2a47bdddeee755d3203d1f84ae08e03e \ + --hash=sha256:a76d39b5fafd79ed604c4be0a869ec3581a172a707e2a8d7a4858cb05a5a7637 \ + --hash=sha256:ad9f30838550695b5eb302add33f21f7301b882937460dd24f24b3cc5a95067a \ + --hash=sha256:b2266862c5ad664a380fbbcdbdb8289d71464c42a8c29053820ee78ba0119e5d \ + --hash=sha256:b78a99cd1ece4be92ab7c07765a0b038194ded2e0a26fd654591ee136088d8d7 \ + --hash=sha256:c200cb6f2393468142eb50ab19613229dcc7829b5ccee8b658a36005f6669fdd \ + --hash=sha256:c30f393f9d5ff00a71bb56de4aa75b8fe91b161aeb61d39528db6b768d7eac69 \ + --hash=sha256:c6a0a28450c16809f94e0b5bfe52cabff63e7e4b97b44123ebf77f448534d07d \ + --hash=sha256:cebc1b34ba40a312ab480ccdb396ff3c529377a2fce72c45a741f7215bfe8379 \ + --hash=sha256:d2c170247315f2d7e5798a22358e982ad6eeb68fa20cf7a820bb74c11f0736e7 \ + --hash=sha256:d35a95f05a8a2cbe8e02be137740138b3b2ea5f80bd004444e4f9a1ffc511e32 \ + --hash=sha256:d5170929109450a2c031cfe87d6716f2fae39695ad5335d9106ae88cc32dc84c \ + --hash=sha256:d6aa986318c36508dc1d5001a3ff169a15b99b9f96ef5e98e13522c506b37eef \ + --hash=sha256:d6de81c9c00c8a23047136b11794b3584cdc1460ed7cbc10eada50614baa1444 \ + --hash=sha256:dc1a1231ed23caac1de9f943d031f1bc38d0f69d2a3b243ea0d664fc1fbd7fec \ + --hash=sha256:e6beeea5566092c5e3c4896c6d1d307fb46b1d4bdf3e70c8340b190a69198594 \ + --hash=sha256:e6d8de076528f7c43a2f576bc311799f89d795aa6c9b637377cc2b1616473804 \ + --hash=sha256:e6f83a583ed0a5b08c5bc7a3fe860bb3c2eac1f03f1f63e0bc2091325605d2b7 \ + --hash=sha256:f250ff44843d9a0615e350c77f890082102a0318d66a99540f54769c8766ab73 \ + --hash=sha256:f71574afdf944e6652203cd1badcda195b2a27d9c83e6d88dc1ce3cfb73b31a5 \ + --hash=sha256:f903017db76bf9cc2b2d8bdd37bf04b505bbccad6be8a81e1542206875d0e9db \ + --hash=sha256:f9a412f55bb6e8f3bb000e020dbc1e709627dcb3a56f6431fa7076b4c1aab0db \ + --hash=sha256:f9c30c464cb2ddfbc2ddf9400287701270fdc0f14be5f08a1e3939f1e749b455 + # via + # -r ci/official/requirements_updater/requirements.in + # tb-nightly +h5py==3.13.0 \ + 
--hash=sha256:10894c55d46df502d82a7a4ed38f9c3fdbcb93efb42e25d275193e093071fade \ + --hash=sha256:1870e46518720023da85d0895a1960ff2ce398c5671eac3b1a41ec696b7105c3 \ + --hash=sha256:21daf38171753899b5905f3d82c99b0b1ec2cbbe282a037cad431feb620e62ec \ + --hash=sha256:22ffe2a25770a2d67213a1b94f58006c14dce06933a42d2aaa0318c5868d1508 \ + --hash=sha256:337af114616f3656da0c83b68fcf53ecd9ce9989a700b0883a6e7c483c3235d4 \ + --hash=sha256:357e6dc20b101a805ccfd0024731fbaf6e8718c18c09baf3b5e4e9d198d13fca \ + --hash=sha256:477c58307b6b9a2509c59c57811afb9f598aedede24a67da808262dfa0ee37b4 \ + --hash=sha256:4f97ecde7ac6513b21cd95efdfc38dc6d19f96f6ca6f2a30550e94e551458e0a \ + --hash=sha256:5540daee2b236d9569c950b417f13fd112d51d78b4c43012de05774908dff3f5 \ + --hash=sha256:560e71220dc92dfa254b10a4dcb12d56b574d2d87e095db20466b32a93fec3f9 \ + --hash=sha256:56dd172d862e850823c4af02dc4ddbc308f042b85472ffdaca67f1598dff4a57 \ + --hash=sha256:57c4c74f627c616f02b7aec608a8c706fe08cb5b0ba7c08555a4eb1dde20805a \ + --hash=sha256:782ff0ac39f455f21fd1c8ebc007328f65f43d56718a89327eec76677ebf238a \ + --hash=sha256:82690e89c72b85addf4fc4d5058fb1e387b6c14eb063b0b879bf3f42c3b93c35 \ + --hash=sha256:851ae3a8563d87a5a0dc49c2e2529c75b8842582ccaefbf84297d2cfceeacd61 \ + --hash=sha256:8a8e38ef4ceb969f832cc230c0cf808c613cc47e31e768fd7b1106c55afa1cb8 \ + --hash=sha256:9c82ece71ed1c2b807b6628e3933bc6eae57ea21dac207dca3470e3ceaaf437c \ + --hash=sha256:be949b46b7388074c5acae017fbbe3e5ba303fd9daaa52157fdfef30bbdacadd \ + --hash=sha256:c10f061764d8dce0a9592ce08bfd5f243a00703325c388f1086037e5d619c5f1 \ + --hash=sha256:d2cf6a231a07c14acd504a945a6e9ec115e0007f675bde5e0de30a4dc8d86a31 \ + --hash=sha256:d571644958c5e19a61c793d8d23cd02479572da828e333498c9acc463f4a3997 \ + --hash=sha256:d6f13f9b5ce549448c01e4dfe08ea8d1772e6078799af2c1c8d09e941230a90d \ + --hash=sha256:e520ec76de00943dd017c8ea3f354fa1d2f542eac994811943a8faedf2a7d5cb \ + --hash=sha256:e79d8368cd9295045956bfb436656bea3f915beaa11d342e9f79f129f5178763 \ + --hash=sha256:f35640e81b03c02a88b8bf99fb6a9d3023cc52f7c627694db2f379e0028f2868 \ + --hash=sha256:fb267ce4b83f9c42560e9ff4d30f60f7ae492eacf9c7ede849edf8c1b860e16b + # via + # -r ci/official/requirements_updater/requirements.in + # keras-nightly +idna==3.10 \ + --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ + --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 + # via requests +jax==0.4.7 \ + --hash=sha256:5e7002d74db25f97c99b979d4ba1233b1ef26e1597e5fc468ad11d1c8a9dc4f8 + # via -r ci/official/requirements_updater/requirements.in +keras-nightly==3.0.4.dev2024021403 \ + --hash=sha256:24ce69d29d582771685bf4235f59663723405b5a5b16f3eaff2657e52e74663a \ + --hash=sha256:9f416e66b820ef833779d219d255b346b8b90a72fdbd0b2f1e90a43ad142a03d + # via -r ci/official/requirements_updater/requirements.in +libclang==18.1.1 \ + --hash=sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a \ + --hash=sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8 \ + --hash=sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb \ + --hash=sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592 \ + --hash=sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f \ + --hash=sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5 \ + --hash=sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8 \ + --hash=sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250 \ + 
--hash=sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b \ + --hash=sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe + # via -r ci/official/requirements_updater/requirements.in +lit==17.0.6 \ + --hash=sha256:dfa9af9b55fc4509a56be7bf2346f079d7f4a242d583b9f2e0b078fd0abae31b + # via -r ci/official/requirements_updater/requirements.in +markdown==3.7 \ + --hash=sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2 \ + --hash=sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803 + # via tb-nightly +markdown-it-py==3.0.0 \ + --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ + --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb + # via rich +markupsafe==3.0.2 \ + --hash=sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4 \ + --hash=sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30 \ + --hash=sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0 \ + --hash=sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9 \ + --hash=sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396 \ + --hash=sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13 \ + --hash=sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028 \ + --hash=sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca \ + --hash=sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557 \ + --hash=sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832 \ + --hash=sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0 \ + --hash=sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b \ + --hash=sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579 \ + --hash=sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a \ + --hash=sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c \ + --hash=sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff \ + --hash=sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c \ + --hash=sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22 \ + --hash=sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094 \ + --hash=sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb \ + --hash=sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e \ + --hash=sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5 \ + --hash=sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a \ + --hash=sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d \ + --hash=sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a \ + --hash=sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b \ + --hash=sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8 \ + --hash=sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225 \ + --hash=sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c \ + --hash=sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144 \ + --hash=sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f \ + --hash=sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87 \ + --hash=sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d \ + 
--hash=sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93 \ + --hash=sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf \ + --hash=sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158 \ + --hash=sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84 \ + --hash=sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb \ + --hash=sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48 \ + --hash=sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171 \ + --hash=sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c \ + --hash=sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6 \ + --hash=sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd \ + --hash=sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d \ + --hash=sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1 \ + --hash=sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d \ + --hash=sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca \ + --hash=sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a \ + --hash=sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29 \ + --hash=sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe \ + --hash=sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798 \ + --hash=sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c \ + --hash=sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8 \ + --hash=sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f \ + --hash=sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f \ + --hash=sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a \ + --hash=sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178 \ + --hash=sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0 \ + --hash=sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79 \ + --hash=sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430 \ + --hash=sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50 + # via werkzeug +mdurl==0.1.2 \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via markdown-it-py +ml-dtypes==0.5.1 \ + --hash=sha256:023ce2f502efd4d6c1e0472cc58ce3640d051d40e71e27386bed33901e201327 \ + --hash=sha256:05f23447a1c20ddf4dc7c2c661aa9ed93fcb2658f1017c204d1e758714dc28a8 \ + --hash=sha256:12651420130ee7cc13059fc56dac6ad300c3af3848b802d475148c9defd27c23 \ + --hash=sha256:141b2ea2f20bb10802ddca55d91fe21231ef49715cfc971998e8f2a9838f3dbe \ + --hash=sha256:15ad0f3b0323ce96c24637a88a6f44f6713c64032f27277b069f285c3cf66478 \ + --hash=sha256:1b7fbe5571fdf28fd3aaab3ef4aafc847de9ebf263be959958c1ca58ec8eadf5 \ + --hash=sha256:26ebcc69d7b779c8f129393e99732961b5cc33fcff84090451f448c89b0e01b4 \ + --hash=sha256:6f462f5eca22fb66d7ff9c4744a3db4463af06c49816c4b6ac89b16bfcdc592e \ + --hash=sha256:6f76232163b5b9c34291b54621ee60417601e2e4802a188a0ea7157cd9b323f4 \ + --hash=sha256:7000b6e4d8ef07542c05044ec5d8bbae1df083b3f56822c3da63993a113e716f \ + --hash=sha256:810512e2eccdfc3b41eefa3a27402371a3411453a1efc7e9c000318196140fed \ + --hash=sha256:8f2c028954f16ede77902b223a8da2d9cbb3892375b85809a5c3cfb1587960c4 
\ + --hash=sha256:9626d0bca1fb387d5791ca36bacbba298c5ef554747b7ebeafefb4564fc83566 \ + --hash=sha256:ac5b58559bb84a95848ed6984eb8013249f90b6bab62aa5acbad876e256002c9 \ + --hash=sha256:ad4953c5eb9c25a56d11a913c2011d7e580a435ef5145f804d98efa14477d390 \ + --hash=sha256:aefedc579ece2f8fb38f876aa7698204ee4c372d0e54f1c1ffa8ca580b54cc60 \ + --hash=sha256:afb2009ac98da274e893e03162f6269398b2b00d947e7057ee2469a921d58135 \ + --hash=sha256:b8a9d46b4df5ae2135a8e8e72b465448ebbc1559997f4f9304a9ecc3413efb5b \ + --hash=sha256:bd73f51957949069573ff783563486339a9285d72e2f36c18e0c1aa9ca7eb190 \ + --hash=sha256:bf9975bda82a99dc935f2ae4c83846d86df8fd6ba179614acac8e686910851da \ + --hash=sha256:c09526488c3a9e8b7a23a388d4974b670a9a3dd40c5c8a61db5593ce9b725bab \ + --hash=sha256:c9945669d3dadf8acb40ec2e57d38c985d8c285ea73af57fc5b09872c516106d \ + --hash=sha256:d13755f8e8445b3870114e5b6240facaa7cb0c3361e54beba3e07fa912a6e12b \ + --hash=sha256:fd918d4e6a4e0c110e2e05be7a7814d10dc1b95872accbf6512b80a109b71ae1 + # via + # -r ci/official/requirements_updater/requirements.in + # jax + # keras-nightly +namex==0.0.8 \ + --hash=sha256:32a50f6c565c0bb10aa76298c959507abdc0e850efe085dc38f3440fcb3aa90b \ + --hash=sha256:7ddb6c2bb0e753a311b7590f84f6da659dd0c05e65cb89d519d54c0a250c0487 + # via keras-nightly +numpy==2.1.3 \ + --hash=sha256:016d0f6f5e77b0f0d45d77387ffa4bb89816b57c835580c3ce8e099ef830befe \ + --hash=sha256:02135ade8b8a84011cbb67dc44e07c58f28575cf9ecf8ab304e51c05528c19f0 \ + --hash=sha256:08788d27a5fd867a663f6fc753fd7c3ad7e92747efc73c53bca2f19f8bc06f48 \ + --hash=sha256:0d30c543f02e84e92c4b1f415b7c6b5326cbe45ee7882b6b77db7195fb971e3a \ + --hash=sha256:0fa14563cc46422e99daef53d725d0c326e99e468a9320a240affffe87852564 \ + --hash=sha256:13138eadd4f4da03074851a698ffa7e405f41a0845a6b1ad135b81596e4e9958 \ + --hash=sha256:14e253bd43fc6b37af4921b10f6add6925878a42a0c5fe83daee390bca80bc17 \ + --hash=sha256:15cb89f39fa6d0bdfb600ea24b250e5f1a3df23f901f51c8debaa6a5d122b2f0 \ + --hash=sha256:17ee83a1f4fef3c94d16dc1802b998668b5419362c8a4f4e8a491de1b41cc3ee \ + --hash=sha256:2312b2aa89e1f43ecea6da6ea9a810d06aae08321609d8dc0d0eda6d946a541b \ + --hash=sha256:2564fbdf2b99b3f815f2107c1bbc93e2de8ee655a69c261363a1172a79a257d4 \ + --hash=sha256:3522b0dfe983a575e6a9ab3a4a4dfe156c3e428468ff08ce582b9bb6bd1d71d4 \ + --hash=sha256:4394bc0dbd074b7f9b52024832d16e019decebf86caf909d94f6b3f77a8ee3b6 \ + --hash=sha256:45966d859916ad02b779706bb43b954281db43e185015df6eb3323120188f9e4 \ + --hash=sha256:4d1167c53b93f1f5d8a139a742b3c6f4d429b54e74e6b57d0eff40045187b15d \ + --hash=sha256:4f2015dfe437dfebbfce7c85c7b53d81ba49e71ba7eadbf1df40c915af75979f \ + --hash=sha256:50ca6aba6e163363f132b5c101ba078b8cbd3fa92c7865fd7d4d62d9779ac29f \ + --hash=sha256:50d18c4358a0a8a53f12a8ba9d772ab2d460321e6a93d6064fc22443d189853f \ + --hash=sha256:5641516794ca9e5f8a4d17bb45446998c6554704d888f86df9b200e66bdcce56 \ + --hash=sha256:576a1c1d25e9e02ed7fa5477f30a127fe56debd53b8d2c89d5578f9857d03ca9 \ + --hash=sha256:6a4825252fcc430a182ac4dee5a505053d262c807f8a924603d411f6718b88fd \ + --hash=sha256:72dcc4a35a8515d83e76b58fdf8113a5c969ccd505c8a946759b24e3182d1f23 \ + --hash=sha256:747641635d3d44bcb380d950679462fae44f54b131be347d5ec2bce47d3df9ed \ + --hash=sha256:762479be47a4863e261a840e8e01608d124ee1361e48b96916f38b119cfda04a \ + --hash=sha256:78574ac2d1a4a02421f25da9559850d59457bac82f2b8d7a44fe83a64f770098 \ + --hash=sha256:825656d0743699c529c5943554d223c021ff0494ff1442152ce887ef4f7561a1 \ + --hash=sha256:8637dcd2caa676e475503d1f8fdb327bc495554e10838019651b76d17b98e512 \ + 
--hash=sha256:96fe52fcdb9345b7cd82ecd34547fca4321f7656d500eca497eb7ea5a926692f \ + --hash=sha256:973faafebaae4c0aaa1a1ca1ce02434554d67e628b8d805e61f874b84e136b09 \ + --hash=sha256:996bb9399059c5b82f76b53ff8bb686069c05acc94656bb259b1d63d04a9506f \ + --hash=sha256:a38c19106902bb19351b83802531fea19dee18e5b37b36454f27f11ff956f7fc \ + --hash=sha256:a6b46587b14b888e95e4a24d7b13ae91fa22386c199ee7b418f449032b2fa3b8 \ + --hash=sha256:a9f7f672a3388133335589cfca93ed468509cb7b93ba3105fce780d04a6576a0 \ + --hash=sha256:aa08e04e08aaf974d4458def539dece0d28146d866a39da5639596f4921fd761 \ + --hash=sha256:b0df3635b9c8ef48bd3be5f862cf71b0a4716fa0e702155c45067c6b711ddcef \ + --hash=sha256:b47fbb433d3260adcd51eb54f92a2ffbc90a4595f8970ee00e064c644ac788f5 \ + --hash=sha256:baed7e8d7481bfe0874b566850cb0b85243e982388b7b23348c6db2ee2b2ae8e \ + --hash=sha256:bc6f24b3d1ecc1eebfbf5d6051faa49af40b03be1aaa781ebdadcbc090b4539b \ + --hash=sha256:c006b607a865b07cd981ccb218a04fc86b600411d83d6fc261357f1c0966755d \ + --hash=sha256:c181ba05ce8299c7aa3125c27b9c2167bca4a4445b7ce73d5febc411ca692e43 \ + --hash=sha256:c7662f0e3673fe4e832fe07b65c50342ea27d989f92c80355658c7f888fcc83c \ + --hash=sha256:c80e4a09b3d95b4e1cac08643f1152fa71a0a821a2d4277334c88d54b2219a41 \ + --hash=sha256:c894b4305373b9c5576d7a12b473702afdf48ce5369c074ba304cc5ad8730dff \ + --hash=sha256:d7aac50327da5d208db2eec22eb11e491e3fe13d22653dce51b0f4109101b408 \ + --hash=sha256:d89dd2b6da69c4fff5e39c28a382199ddedc3a5be5390115608345dec660b9e2 \ + --hash=sha256:d9beb777a78c331580705326d2367488d5bc473b49a9bc3036c154832520aca9 \ + --hash=sha256:dc258a761a16daa791081d026f0ed4399b582712e6fc887a95af09df10c5ca57 \ + --hash=sha256:e14e26956e6f1696070788252dcdff11b4aca4c3e8bd166e0df1bb8f315a67cb \ + --hash=sha256:e6988e90fcf617da2b5c78902fe8e668361b43b4fe26dbf2d7b0f8034d4cafb9 \ + --hash=sha256:e711e02f49e176a01d0349d82cb5f05ba4db7d5e7e0defd026328e5cfb3226d3 \ + --hash=sha256:ea4dedd6e394a9c180b33c2c872b92f7ce0f8e7ad93e9585312b0c5a04777a4a \ + --hash=sha256:ecc76a9ba2911d8d37ac01de72834d8849e55473457558e12995f4cd53e778e0 \ + --hash=sha256:f55ba01150f52b1027829b50d70ef1dafd9821ea82905b63936668403c3b471e \ + --hash=sha256:f653490b33e9c3a4c1c01d41bc2aef08f9475af51146e4a7710c450cf9761598 \ + --hash=sha256:fa2d1337dc61c8dc417fbccf20f6d1e139896a30721b7f1e832b2bb6ef4eb6c4 + # via + # -r ci/official/requirements_updater/requirements.in + # dm-tree + # h5py + # jax + # keras-nightly + # ml-dtypes + # opt-einsum + # scipy + # tb-nightly +nvidia-cublas-cu12==12.5.3.2 \ + --hash=sha256:4960f3dc5f39699acadf76fa6d94b10a2a00f2956c2c442efa299fb22b0748f3 \ + --hash=sha256:7d0191251180de606023d396b94d66f66470a0ae96d1dbb906c7656ea0f71eda \ + --hash=sha256:ca070ad70e9fa6654084575d01bd001f30cc4665e33d4bb9fc8e0f321caa034b + # via + # -r ci/official/requirements_updater/requirements.in + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 +nvidia-cuda-cupti-cu12==12.5.82 \ + --hash=sha256:4f835281cf492e2bedd153f5c3de9da8f1d775a419468305e64ce73b3b0c6dc3 \ + --hash=sha256:bde77a5feb66752ec61db2adfe47f56b941842825b4c7e2068aff27c9d107953 \ + --hash=sha256:d32c06490c6ba35c4323730820c7d0c4c126c04ed58d2f57275adb8d54b138fe + # via -r ci/official/requirements_updater/requirements.in +nvidia-cuda-nvrtc-cu12==12.5.82 \ + --hash=sha256:3dbd97b0104b4bfbc3c4f8c79cd2496307c89c43c29a9f83125f1d76296ff3fd \ + --hash=sha256:5bb6a0eb01d4974bb7ca3d48bd3859472debb3c3057a5e7de2b08fbdf35eed7e \ + --hash=sha256:e5db37e990056c70953b7772dd778336ef9da0a0b5bb28f9f2a61c2e42b51d78 + # via -r 
ci/official/requirements_updater/requirements.in +nvidia-cuda-runtime-cu12==12.5.82 \ + --hash=sha256:0fd5fbca289bceb9f0690aa9858f06187b554fdeb7e2711dfd5bb3ce58900b46 \ + --hash=sha256:3e79a060e126df40fd3a068f3f787eb000fa51b251ec6cd97d09579632687115 \ + --hash=sha256:71f015dbf9df05dd71f7480132c6ebf47a6ceb2ab53d7db8e08e4b30ebb87e14 + # via -r ci/official/requirements_updater/requirements.in +nvidia-cudnn-cu12==9.3.0.75 \ + --hash=sha256:9ad9c6929ebb5295eb4a1728024666d1c88283373e265a0c5c883e6f9d5cd76d \ + --hash=sha256:c5cf7ff3415e446adf195a5b7dd2ba56cd00c3ee78bfdc566e51698931aa4b7f \ + --hash=sha256:c819e82eed8cf564b9d37478ea4eab9e87194bb3b7f7f8098bc1f67c9b80f1b6 + # via -r ci/official/requirements_updater/requirements.in +nvidia-cufft-cu12==11.2.3.61 \ + --hash=sha256:4a8f6f0ce93c52a50ee83422a80472b5f376054a63f38532d0eab4007e7ef28b \ + --hash=sha256:6d45b48a5ee7599e57131129cda2c58544d9b78b95064d3ec3e5c6b96e2b58cc \ + --hash=sha256:9a6e8df162585750f61983a638104a48c756aa13f9f48e19ab079b38e3c828b8 + # via -r ci/official/requirements_updater/requirements.in +nvidia-curand-cu12==10.3.6.82 \ + --hash=sha256:0631ba65231260ad832ce233ddda57e7b3b7158eabf000d78e46cbb5bd5b7aae \ + --hash=sha256:2823fb27de4e44dbb22394a6adf53aa6e1b013aca0f8c22867d1cfae58405536 \ + --hash=sha256:36aabeb5990297bbce3df324ea7c7c13c3aabb140c86d50ab3b23e4ec61672f1 + # via -r ci/official/requirements_updater/requirements.in +nvidia-cusolver-cu12==11.6.3.83 \ + --hash=sha256:1b8b77d2fe8abe72bb722dafb708cceaeb81f1a03999477f20b33b34f46ab885 \ + --hash=sha256:6224732963cba312a84c78114b9a38c4ffabb2e2a6a120923ac99ba6f895c8cf \ + --hash=sha256:93cfafacde4428b71778eeb092ec615a02a3d05404da1bcf91c53e3fa1bce42b + # via -r ci/official/requirements_updater/requirements.in +nvidia-cusparse-cu12==12.5.1.3 \ + --hash=sha256:016df8e993c437e8301e62739f01775cba988fd5253cd4c64173f8e8d2f8e752 \ + --hash=sha256:33520db374e2f5ebc976d6faa1852b98c398a57e6f71150fe59705928596ffd1 \ + --hash=sha256:7b97fd01f0a61628af99d0efd52132fccc8c18fc5c509f13802dccf0574a19c2 + # via + # -r ci/official/requirements_updater/requirements.in + # nvidia-cusolver-cu12 +nvidia-nccl-cu12==2.25.1 \ + --hash=sha256:362aed5963fb9ea2ed2f264409baae30143498fd0e5c503aeaa1badd88cdc54a \ + --hash=sha256:4ab428bc915785cc66e8c57cb34c7a64cf739c46702b8db748b6ad6cc7180cf8 + # via -r ci/official/requirements_updater/requirements.in +nvidia-nvjitlink-cu12==12.5.82 \ + --hash=sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27 \ + --hash=sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697 \ + --hash=sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212 + # via + # -r ci/official/requirements_updater/requirements.in + # nvidia-cufft-cu12 + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +opt-einsum==3.3.0 \ + --hash=sha256:2455e59e3947d3c275477df7f5205b30635e266fe6dc300e3d9f9646bfcea147 \ + --hash=sha256:59f6475f77bbc37dcf7cd748519c0ec60722e91e63ca114e68821c0c54a46549 + # via + # -r ci/official/requirements_updater/requirements.in + # jax +packaging==23.2 \ + --hash=sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5 \ + --hash=sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7 + # via + # -r ci/official/requirements_updater/requirements.in + # auditwheel + # tb-nightly +portpicker==1.6.0 \ + --hash=sha256:b2787a41404cf7edbe29b07b9e0ed863b09f2665dcc01c1eb0c2261c1e7d0755 \ + --hash=sha256:bd507fd6f96f65ee02781f2e674e9dc6c99bbfa6e3c39992e3916204c9d431fa + # via -r 
ci/official/requirements_updater/requirements.in +protobuf==6.30.2 \ + --hash=sha256:0eb523c550a66a09a0c20f86dd554afbf4d32b02af34ae53d93268c1f73bc65b \ + --hash=sha256:35c859ae076d8c56054c25b59e5e59638d86545ed6e2b6efac6be0b6ea3ba048 \ + --hash=sha256:4f6c687ae8efae6cf6093389a596548214467778146b7245e886f35e1485315d \ + --hash=sha256:50f32cc9fd9cb09c783ebc275611b4f19dfdfb68d1ee55d2f0c7fa040df96815 \ + --hash=sha256:524afedc03b31b15586ca7f64d877a98b184f007180ce25183d1a5cb230ee72b \ + --hash=sha256:7653c99774f73fe6b9301b87da52af0e69783a2e371e8b599b3e9cb4da4b12b9 \ + --hash=sha256:acec579c39c88bd8fbbacab1b8052c793efe83a0a5bd99db4a31423a25c0a0e2 \ + --hash=sha256:ae86b030e69a98e08c77beab574cbcb9fff6d031d57209f574a5aea1445f4b51 \ + --hash=sha256:b12ef7df7b9329886e66404bef5e9ce6a26b54069d7f7436a0853ccdeb91c103 + # via tb-nightly +psutil==7.0.0 \ + --hash=sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25 \ + --hash=sha256:1e744154a6580bc968a0195fd25e80432d3afec619daf145b9e5ba16cc1d688e \ + --hash=sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91 \ + --hash=sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da \ + --hash=sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34 \ + --hash=sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553 \ + --hash=sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456 \ + --hash=sha256:84df4eb63e16849689f76b1ffcb36db7b8de703d1bc1fe41773db487621b6c17 \ + --hash=sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993 \ + --hash=sha256:ba3fcef7523064a6c9da440fc4d6bd07da93ac726b5733c29027d7dc95b39d99 + # via portpicker +pyelftools==0.32 \ + --hash=sha256:013df952a006db5e138b1edf6d8a68ecc50630adbd0d83a2d41e7f846163d738 \ + --hash=sha256:6de90ee7b8263e740c8715a925382d4099b354f29ac48ea40d840cf7aa14ace5 + # via auditwheel +pygments==2.19.1 \ + --hash=sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f \ + --hash=sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c + # via rich +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 + # via -r ci/official/requirements_updater/requirements.in +rich==14.0.0 \ + --hash=sha256:1c9491e1951aac09caffd42f448ee3d04e58923ffe14993f6e83068dc395d7e0 \ + --hash=sha256:82f1bc23a6a21ebca4ae0c45af9bdbc492ed20231dcb63f297d6d1021a9d5725 + # via keras-nightly +scipy==1.15.2 \ + --hash=sha256:01edfac9f0798ad6b46d9c4c9ca0e0ad23dbf0b1eb70e96adb9fa7f525eff0bf \ + --hash=sha256:03205d57a28e18dfd39f0377d5002725bf1f19a46f444108c29bdb246b6c8a11 \ + --hash=sha256:08b57a9336b8e79b305a143c3655cc5bdbe6d5ece3378578888d2afbb51c4e37 \ + --hash=sha256:11e7ad32cf184b74380f43d3c0a706f49358b904fa7d5345f16ddf993609184d \ + --hash=sha256:28a0d2c2075946346e4408b211240764759e0fabaeb08d871639b5f3b1aca8a0 \ + --hash=sha256:2b871df1fe1a3ba85d90e22742b93584f8d2b8e6124f8372ab15c71b73e428b8 \ + --hash=sha256:302093e7dfb120e55515936cb55618ee0b895f8bcaf18ff81eca086c17bd80af \ + --hash=sha256:42dabaaa798e987c425ed76062794e93a243be8f0f20fff6e7a89f4d61cb3d40 \ + --hash=sha256:447ce30cee6a9d5d1379087c9e474628dab3db4a67484be1b7dc3196bfb2fac9 \ + --hash=sha256:4c6676490ad76d1c2894d77f976144b41bd1a4052107902238047fb6a473e971 \ + --hash=sha256:54c462098484e7466362a9f1672d20888f724911a74c22ae35b61f9c5919183d \ + --hash=sha256:597a0c7008b21c035831c39927406c6181bcf8f60a73f36219b69d010aa04737 
\ + --hash=sha256:5a6fd6eac1ce74a9f77a7fc724080d507c5812d61e72bd5e4c489b042455865e \ + --hash=sha256:5ea7ed46d437fc52350b028b1d44e002646e28f3e8ddc714011aaf87330f2f32 \ + --hash=sha256:601881dfb761311045b03114c5fe718a12634e5608c3b403737ae463c9885d53 \ + --hash=sha256:62ca1ff3eb513e09ed17a5736929429189adf16d2d740f44e53270cc800ecff1 \ + --hash=sha256:69ea6e56d00977f355c0f84eba69877b6df084516c602d93a33812aa04d90a3d \ + --hash=sha256:6a8e34cf4c188b6dd004654f88586d78f95639e48a25dfae9c5e34a6dc34547e \ + --hash=sha256:6d0194c37037707b2afa7a2f2a924cf7bac3dc292d51b6a925e5fcb89bc5c776 \ + --hash=sha256:6f223753c6ea76983af380787611ae1291e3ceb23917393079dcc746ba60cfb5 \ + --hash=sha256:6f5e296ec63c5da6ba6fa0343ea73fd51b8b3e1a300b0a8cae3ed4b1122c7462 \ + --hash=sha256:7cd5b77413e1855351cdde594eca99c1f4a588c2d63711388b6a1f1c01f62274 \ + --hash=sha256:869269b767d5ee7ea6991ed7e22b3ca1f22de73ab9a49c44bad338b725603301 \ + --hash=sha256:87994da02e73549dfecaed9e09a4f9d58a045a053865679aeb8d6d43747d4df3 \ + --hash=sha256:888307125ea0c4466287191e5606a2c910963405ce9671448ff9c81c53f85f58 \ + --hash=sha256:92233b2df6938147be6fa8824b8136f29a18f016ecde986666be5f4d686a91a4 \ + --hash=sha256:9412f5e408b397ff5641080ed1e798623dbe1ec0d78e72c9eca8992976fa65aa \ + --hash=sha256:9b18aa747da280664642997e65aab1dd19d0c3d17068a04b3fe34e2559196cb9 \ + --hash=sha256:9de9d1416b3d9e7df9923ab23cd2fe714244af10b763975bea9e4f2e81cebd27 \ + --hash=sha256:a2ec871edaa863e8213ea5df811cd600734f6400b4af272e1c011e69401218e9 \ + --hash=sha256:a5080a79dfb9b78b768cebf3c9dcbc7b665c5875793569f48bf0e2b1d7f68f6f \ + --hash=sha256:a8bf5cb4a25046ac61d38f8d3c3426ec11ebc350246a4642f2f315fe95bda655 \ + --hash=sha256:b09ae80010f52efddb15551025f9016c910296cf70adbf03ce2a8704f3a5ad20 \ + --hash=sha256:b5e025e903b4f166ea03b109bb241355b9c42c279ea694d8864d033727205e65 \ + --hash=sha256:bad78d580270a4d32470563ea86c6590b465cb98f83d760ff5b0990cb5518a93 \ + --hash=sha256:bae43364d600fdc3ac327db99659dcb79e6e7ecd279a75fe1266669d9a652828 \ + --hash=sha256:c4697a10da8f8765bb7c83e24a470da5797e37041edfd77fd95ba3811a47c4fd \ + --hash=sha256:c90ebe8aaa4397eaefa8455a8182b164a6cc1d59ad53f79943f266d99f68687f \ + --hash=sha256:cd58a314d92838f7e6f755c8a2167ead4f27e1fd5c1251fd54289569ef3495ec \ + --hash=sha256:cf72ff559a53a6a6d77bd8eefd12a17995ffa44ad86c77a5df96f533d4e6c6bb \ + --hash=sha256:def751dd08243934c884a3221156d63e15234a3155cf25978b0a668409d45eb6 \ + --hash=sha256:e7c68b6a43259ba0aab737237876e5c2c549a031ddb7abc28c7b47f22e202ded \ + --hash=sha256:ecf797d2d798cf7c838c6d98321061eb3e72a74710e6c40540f0e8087e3b499e \ + --hash=sha256:f031846580d9acccd0044efd1a90e6f4df3a6e12b4b6bd694a7bc03a89892b28 \ + --hash=sha256:fb530e4794fc8ea76a4a21ccb67dea33e5e0e60f07fc38a49e821e1eae3b71a0 \ + --hash=sha256:fe8a9eb875d430d81755472c5ba75e84acc980e4a8f6204d402849234d3017db + # via + # -r ci/official/requirements_updater/requirements.in + # jax +six==1.17.0 \ + --hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \ + --hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81 + # via + # astunparse + # google-pasta + # tb-nightly +tb-nightly==2.19.0a20250218 \ + --hash=sha256:7c7fea911a9e113e7d40fa9aed96168840e2443c5ada52fba5bc3645ec6e206f + # via -r ci/official/requirements_updater/requirements.in +tblib==2.0.0 \ + --hash=sha256:9100bfa016b047d5b980d66e7efed952fbd20bd85b56110aaf473cb97d18709a \ + --hash=sha256:a6df30f272c08bf8be66e0775fad862005d950a6b8449b94f7c788731d70ecd7 + # via -r ci/official/requirements_updater/requirements.in 
+tensorboard-data-server==0.7.2 \ + --hash=sha256:7e0610d205889588983836ec05dc098e80f97b7e7bbff7e994ebb78f578d0ddb \ + --hash=sha256:9fe5d24221b29625dbc7328b0436ca7fc1c23de4acf4d272f1180856e32f9f60 \ + --hash=sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530 + # via tb-nightly +termcolor==2.3.0 \ + --hash=sha256:3afb05607b89aed0ffe25202399ee0867ad4d3cb4180d98aaf8eefa6a5f7d475 \ + --hash=sha256:b5b08f68937f138fe92f6c089b99f1e2da0ae56c52b78bf7075fd95420fd9a5a + # via -r ci/official/requirements_updater/requirements.in +typing-extensions==4.8.0 \ + --hash=sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0 \ + --hash=sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef + # via -r ci/official/requirements_updater/requirements.in +urllib3==2.3.0 \ + --hash=sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df \ + --hash=sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d + # via requests +werkzeug==3.1.3 \ + --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \ + --hash=sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746 + # via tb-nightly +wheel==0.41.3 \ + --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \ + --hash=sha256:4d4987ce51a49370ea65c0bfd2234e8ce80a12780820d9dc462597a6e60d0841 + # via + # -r ci/official/requirements_updater/requirements.in + # astunparse +wrapt==1.16.0 \ + --hash=sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc \ + --hash=sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81 \ + --hash=sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09 \ + --hash=sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e \ + --hash=sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca \ + --hash=sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0 \ + --hash=sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb \ + --hash=sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487 \ + --hash=sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40 \ + --hash=sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c \ + --hash=sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060 \ + --hash=sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202 \ + --hash=sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41 \ + --hash=sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9 \ + --hash=sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b \ + --hash=sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664 \ + --hash=sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d \ + --hash=sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362 \ + --hash=sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00 \ + --hash=sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc \ + --hash=sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1 \ + --hash=sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267 \ + --hash=sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956 \ + --hash=sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966 \ + --hash=sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1 \ + 
--hash=sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228 \ + --hash=sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72 \ + --hash=sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d \ + --hash=sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292 \ + --hash=sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0 \ + --hash=sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0 \ + --hash=sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36 \ + --hash=sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c \ + --hash=sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5 \ + --hash=sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f \ + --hash=sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73 \ + --hash=sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b \ + --hash=sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2 \ + --hash=sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593 \ + --hash=sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39 \ + --hash=sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389 \ + --hash=sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf \ + --hash=sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf \ + --hash=sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89 \ + --hash=sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c \ + --hash=sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c \ + --hash=sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f \ + --hash=sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440 \ + --hash=sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465 \ + --hash=sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136 \ + --hash=sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b \ + --hash=sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8 \ + --hash=sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3 \ + --hash=sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8 \ + --hash=sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6 \ + --hash=sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e \ + --hash=sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f \ + --hash=sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c \ + --hash=sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e \ + --hash=sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8 \ + --hash=sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2 \ + --hash=sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020 \ + --hash=sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35 \ + --hash=sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d \ + --hash=sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3 \ + --hash=sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537 \ + --hash=sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809 \ + 
--hash=sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d \ + --hash=sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a \ + --hash=sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4 + # via + # -r ci/official/requirements_updater/requirements.in + # dm-tree +zstandard==0.23.0 \ + --hash=sha256:034b88913ecc1b097f528e42b539453fa82c3557e414b3de9d5632c80439a473 \ + --hash=sha256:0a7f0804bb3799414af278e9ad51be25edf67f78f916e08afdb983e74161b916 \ + --hash=sha256:11e3bf3c924853a2d5835b24f03eeba7fc9b07d8ca499e247e06ff5676461a15 \ + --hash=sha256:12a289832e520c6bd4dcaad68e944b86da3bad0d339ef7989fb7e88f92e96072 \ + --hash=sha256:1516c8c37d3a053b01c1c15b182f3b5f5eef19ced9b930b684a73bad121addf4 \ + --hash=sha256:157e89ceb4054029a289fb504c98c6a9fe8010f1680de0201b3eb5dc20aa6d9e \ + --hash=sha256:1bfe8de1da6d104f15a60d4a8a768288f66aa953bbe00d027398b93fb9680b26 \ + --hash=sha256:1e172f57cd78c20f13a3415cc8dfe24bf388614324d25539146594c16d78fcc8 \ + --hash=sha256:1fd7e0f1cfb70eb2f95a19b472ee7ad6d9a0a992ec0ae53286870c104ca939e5 \ + --hash=sha256:203d236f4c94cd8379d1ea61db2fce20730b4c38d7f1c34506a31b34edc87bdd \ + --hash=sha256:27d3ef2252d2e62476389ca8f9b0cf2bbafb082a3b6bfe9d90cbcbb5529ecf7c \ + --hash=sha256:29a2bc7c1b09b0af938b7a8343174b987ae021705acabcbae560166567f5a8db \ + --hash=sha256:2ef230a8fd217a2015bc91b74f6b3b7d6522ba48be29ad4ea0ca3a3775bf7dd5 \ + --hash=sha256:2ef3775758346d9ac6214123887d25c7061c92afe1f2b354f9388e9e4d48acfc \ + --hash=sha256:2f146f50723defec2975fb7e388ae3a024eb7151542d1599527ec2aa9cacb152 \ + --hash=sha256:2fb4535137de7e244c230e24f9d1ec194f61721c86ebea04e1581d9d06ea1269 \ + --hash=sha256:32ba3b5ccde2d581b1e6aa952c836a6291e8435d788f656fe5976445865ae045 \ + --hash=sha256:34895a41273ad33347b2fc70e1bff4240556de3c46c6ea430a7ed91f9042aa4e \ + --hash=sha256:379b378ae694ba78cef921581ebd420c938936a153ded602c4fea612b7eaa90d \ + --hash=sha256:38302b78a850ff82656beaddeb0bb989a0322a8bbb1bf1ab10c17506681d772a \ + --hash=sha256:3aa014d55c3af933c1315eb4bb06dd0459661cc0b15cd61077afa6489bec63bb \ + --hash=sha256:4051e406288b8cdbb993798b9a45c59a4896b6ecee2f875424ec10276a895740 \ + --hash=sha256:40b33d93c6eddf02d2c19f5773196068d875c41ca25730e8288e9b672897c105 \ + --hash=sha256:43da0f0092281bf501f9c5f6f3b4c975a8a0ea82de49ba3f7100e64d422a1274 \ + --hash=sha256:445e4cb5048b04e90ce96a79b4b63140e3f4ab5f662321975679b5f6360b90e2 \ + --hash=sha256:48ef6a43b1846f6025dde6ed9fee0c24e1149c1c25f7fb0a0585572b2f3adc58 \ + --hash=sha256:50a80baba0285386f97ea36239855f6020ce452456605f262b2d33ac35c7770b \ + --hash=sha256:519fbf169dfac1222a76ba8861ef4ac7f0530c35dd79ba5727014613f91613d4 \ + --hash=sha256:53dd9d5e3d29f95acd5de6802e909ada8d8d8cfa37a3ac64836f3bc4bc5512db \ + --hash=sha256:53ea7cdc96c6eb56e76bb06894bcfb5dfa93b7adcf59d61c6b92674e24e2dd5e \ + --hash=sha256:576856e8594e6649aee06ddbfc738fec6a834f7c85bf7cadd1c53d4a58186ef9 \ + --hash=sha256:59556bf80a7094d0cfb9f5e50bb2db27fefb75d5138bb16fb052b61b0e0eeeb0 \ + --hash=sha256:5d41d5e025f1e0bccae4928981e71b2334c60f580bdc8345f824e7c0a4c2a813 \ + --hash=sha256:61062387ad820c654b6a6b5f0b94484fa19515e0c5116faf29f41a6bc91ded6e \ + --hash=sha256:61f89436cbfede4bc4e91b4397eaa3e2108ebe96d05e93d6ccc95ab5714be512 \ + --hash=sha256:62136da96a973bd2557f06ddd4e8e807f9e13cbb0bfb9cc06cfe6d98ea90dfe0 \ + --hash=sha256:64585e1dba664dc67c7cdabd56c1e5685233fbb1fc1966cfba2a340ec0dfff7b \ + --hash=sha256:65308f4b4890aa12d9b6ad9f2844b7ee42c7f7a4fd3390425b242ffc57498f48 \ + 
--hash=sha256:66b689c107857eceabf2cf3d3fc699c3c0fe8ccd18df2219d978c0283e4c508a \ + --hash=sha256:6a41c120c3dbc0d81a8e8adc73312d668cd34acd7725f036992b1b72d22c1772 \ + --hash=sha256:6f77fa49079891a4aab203d0b1744acc85577ed16d767b52fc089d83faf8d8ed \ + --hash=sha256:72c68dda124a1a138340fb62fa21b9bf4848437d9ca60bd35db36f2d3345f373 \ + --hash=sha256:752bf8a74412b9892f4e5b58f2f890a039f57037f52c89a740757ebd807f33ea \ + --hash=sha256:76e79bc28a65f467e0409098fa2c4376931fd3207fbeb6b956c7c476d53746dd \ + --hash=sha256:774d45b1fac1461f48698a9d4b5fa19a69d47ece02fa469825b442263f04021f \ + --hash=sha256:77da4c6bfa20dd5ea25cbf12c76f181a8e8cd7ea231c673828d0386b1740b8dc \ + --hash=sha256:77ea385f7dd5b5676d7fd943292ffa18fbf5c72ba98f7d09fc1fb9e819b34c23 \ + --hash=sha256:80080816b4f52a9d886e67f1f96912891074903238fe54f2de8b786f86baded2 \ + --hash=sha256:80a539906390591dd39ebb8d773771dc4db82ace6372c4d41e2d293f8e32b8db \ + --hash=sha256:82d17e94d735c99621bf8ebf9995f870a6b3e6d14543b99e201ae046dfe7de70 \ + --hash=sha256:837bb6764be6919963ef41235fd56a6486b132ea64afe5fafb4cb279ac44f259 \ + --hash=sha256:84433dddea68571a6d6bd4fbf8ff398236031149116a7fff6f777ff95cad3df9 \ + --hash=sha256:8c24f21fa2af4bb9f2c492a86fe0c34e6d2c63812a839590edaf177b7398f700 \ + --hash=sha256:8ed7d27cb56b3e058d3cf684d7200703bcae623e1dcc06ed1e18ecda39fee003 \ + --hash=sha256:9206649ec587e6b02bd124fb7799b86cddec350f6f6c14bc82a2b70183e708ba \ + --hash=sha256:983b6efd649723474f29ed42e1467f90a35a74793437d0bc64a5bf482bedfa0a \ + --hash=sha256:98da17ce9cbf3bfe4617e836d561e433f871129e3a7ac16d6ef4c680f13a839c \ + --hash=sha256:9c236e635582742fee16603042553d276cca506e824fa2e6489db04039521e90 \ + --hash=sha256:9da6bc32faac9a293ddfdcb9108d4b20416219461e4ec64dfea8383cac186690 \ + --hash=sha256:a05e6d6218461eb1b4771d973728f0133b2a4613a6779995df557f70794fd60f \ + --hash=sha256:a0817825b900fcd43ac5d05b8b3079937073d2b1ff9cf89427590718b70dd840 \ + --hash=sha256:a4ae99c57668ca1e78597d8b06d5af837f377f340f4cce993b551b2d7731778d \ + --hash=sha256:a8c86881813a78a6f4508ef9daf9d4995b8ac2d147dcb1a450448941398091c9 \ + --hash=sha256:a8fffdbd9d1408006baaf02f1068d7dd1f016c6bcb7538682622c556e7b68e35 \ + --hash=sha256:a9b07268d0c3ca5c170a385a0ab9fb7fdd9f5fd866be004c4ea39e44edce47dd \ + --hash=sha256:ab19a2d91963ed9e42b4e8d77cd847ae8381576585bad79dbd0a8837a9f6620a \ + --hash=sha256:ac184f87ff521f4840e6ea0b10c0ec90c6b1dcd0bad2f1e4a9a1b4fa177982ea \ + --hash=sha256:b0e166f698c5a3e914947388c162be2583e0c638a4703fc6a543e23a88dea3c1 \ + --hash=sha256:b2170c7e0367dde86a2647ed5b6f57394ea7f53545746104c6b09fc1f4223573 \ + --hash=sha256:b2d8c62d08e7255f68f7a740bae85b3c9b8e5466baa9cbf7f57f1cde0ac6bc09 \ + --hash=sha256:b4567955a6bc1b20e9c31612e615af6b53733491aeaa19a6b3b37f3b65477094 \ + --hash=sha256:b69bb4f51daf461b15e7b3db033160937d3ff88303a7bc808c67bbc1eaf98c78 \ + --hash=sha256:b8c0bd73aeac689beacd4e7667d48c299f61b959475cdbb91e7d3d88d27c56b9 \ + --hash=sha256:be9b5b8659dff1f913039c2feee1aca499cfbc19e98fa12bc85e037c17ec6ca5 \ + --hash=sha256:bf0a05b6059c0528477fba9054d09179beb63744355cab9f38059548fedd46a9 \ + --hash=sha256:c16842b846a8d2a145223f520b7e18b57c8f476924bda92aeee3a88d11cfc391 \ + --hash=sha256:c363b53e257246a954ebc7c488304b5592b9c53fbe74d03bc1c64dda153fb847 \ + --hash=sha256:c7c517d74bea1a6afd39aa612fa025e6b8011982a0897768a2f7c8ab4ebb78a2 \ + --hash=sha256:d20fd853fbb5807c8e84c136c278827b6167ded66c72ec6f9a14b863d809211c \ + --hash=sha256:d2240ddc86b74966c34554c49d00eaafa8200a18d3a5b6ffbf7da63b11d74ee2 \ + 
--hash=sha256:d477ed829077cd945b01fc3115edd132c47e6540ddcd96ca169facff28173057 \ + --hash=sha256:d50d31bfedd53a928fed6707b15a8dbeef011bb6366297cc435accc888b27c20 \ + --hash=sha256:dc1d33abb8a0d754ea4763bad944fd965d3d95b5baef6b121c0c9013eaf1907d \ + --hash=sha256:dc5d1a49d3f8262be192589a4b72f0d03b72dcf46c51ad5852a4fdc67be7b9e4 \ + --hash=sha256:e2d1a054f8f0a191004675755448d12be47fa9bebbcffa3cdf01db19f2d30a54 \ + --hash=sha256:e7792606d606c8df5277c32ccb58f29b9b8603bf83b48639b7aedf6df4fe8171 \ + --hash=sha256:ed1708dbf4d2e3a1c5c69110ba2b4eb6678262028afd6c6fbcc5a8dac9cda68e \ + --hash=sha256:f2d4380bf5f62daabd7b751ea2339c1a21d1c9463f1feb7fc2bdcea2c29c3160 \ + --hash=sha256:f3513916e8c645d0610815c257cbfd3242adfd5c4cfa78be514e5a3ebb42a41b \ + --hash=sha256:f8346bfa098532bc1fb6c7ef06783e969d87a99dd1d2a5a18a892c1d7a643c58 \ + --hash=sha256:f83fa6cae3fff8e98691248c9320356971b59678a17f20656a9e59cd32cee6d8 \ + --hash=sha256:fa6ce8b52c5987b3e34d5674b0ab529a4602b632ebab0a93b07bfb4dfc8f8a33 \ + --hash=sha256:fb2b1ecfef1e67897d336de3a0e3f52478182d6a47eda86cbd42504c5cbd009a \ + --hash=sha256:fc9ca1c9718cb3b06634c7c8dec57d24e9438b2aa9a0f02b8bb36bf478538880 \ + --hash=sha256:fd30d9c67d13d891f2360b2a120186729c111238ac63b43dbd37a5a40670b8ca \ + --hash=sha256:fd7699e8fd9969f455ef2926221e0233f81a2542921471382e77a9e2f2b57f4b \ + --hash=sha256:fe3b385d996ee0822fd46528d9f0443b880d4d05528fd26a9119a54ec3f91c69 + # via -r ci/official/requirements_updater/requirements.in + +# The following packages are considered to be unsafe in a requirements file: +setuptools==70.0.0 \ + --hash=sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4 \ + --hash=sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0 + # via + # -r ci/official/requirements_updater/requirements.in + # tb-nightly diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 18c00c7a51d4..995156cdde67 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -190,6 +190,8 @@ package( # name = "build_cleaner_spec_test", # src = "build_cleaner_spec.textproto", # ) +# +# exports_files(srcs = ["METADATA"]) # copybara:uncomment_end licenses(["notice"]) @@ -251,7 +253,7 @@ config_setting( config_setting( name = "android", constraint_values = if_google( - ["//third_party/bazel_platforms/os:android"], + ["@platforms//os:android"], [], ), values = if_oss( @@ -263,45 +265,45 @@ config_setting( config_setting( name = "android_x86", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:android"], - [], - ), + constraint_values = + [ + "@platforms//cpu:x86_32", + "@platforms//os:android", + ], values = dict( if_oss( {"crosstool_top": "//external:android/crosstool"}, ), - cpu = "x86", ), visibility = ["//visibility:public"], ) config_setting( name = "android_x86_64", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:android"], - [], - ), + constraint_values = + [ + "@platforms//cpu:x86_64", + "@platforms//os:android", + ], values = dict( if_oss( {"crosstool_top": "//external:android/crosstool"}, ), - cpu = "x86_64", ), visibility = ["//visibility:public"], ) config_setting( name = "android_armeabi", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:android"], - [], - ), + constraint_values = + [ + "@platforms//cpu:armv6-m", + "@platforms//os:android", + ], values = dict( if_oss( {"crosstool_top": "//external:android/crosstool"}, ), - cpu = "armeabi", ), visibility = ["//visibility:public"], ) @@ -309,22 +311,28 @@ config_setting( # copybara:uncomment_begin(google-only) # config_setting( # 
name = "chromiumos_x86_64", -# constraint_values = ["//third_party/bazel_platforms/os:chromiumos"], -# values = {"cpu": "k8"}, +# constraint_values = [ +# "@platforms//cpu:x86_64", +# "@platforms//os:chromiumos", +# ], # visibility = ["//visibility:public"], # ) # # config_setting( # name = "chromiumos_arm64", -# constraint_values = ["//third_party/bazel_platforms/os:chromiumos"], -# values = {"cpu": "arm"}, +# constraint_values = [ +# "@platforms//cpu:aarch64", +# "@platforms//os:chromiumos", +# ], # visibility = ["//visibility:public"], # ) # # config_setting( # name = "chromiumos_armv7", -# constraint_values = ["//third_party/bazel_platforms/os:chromiumos"], -# values = {"cpu": "armeabi-v7a"}, +# constraint_values = [ +# "@platforms//cpu:armv7", +# "@platforms//os:chromiumos", +# ], # visibility = ["//visibility:public"], # ) # copybara:uncomment_end @@ -332,7 +340,7 @@ config_setting( config_setting( name = "emscripten", constraint_values = if_google( - ["//third_party/bazel_platforms/os:emscripten"], + ["@platforms//os:emscripten"], [], ), values = if_oss( @@ -344,57 +352,56 @@ config_setting( config_setting( name = "raspberry_pi_armeabi", + constraint_values = + [ + "@platforms//cpu:armv6-m", + "@platforms//os:linux", + ], values = { "crosstool_top": "@local_config_arm_compiler//:toolchain", - "cpu": "armeabi", }, visibility = ["//visibility:public"], ) config_setting( name = "android_arm", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:android"], - [], - ), + constraint_values = + [ + "@platforms//cpu:armv7", + "@platforms//os:android", + ], values = dict( if_oss( {"crosstool_top": "//external:android/crosstool"}, ), - cpu = "armeabi-v7a", ), visibility = ["//visibility:public"], ) config_setting( name = "android_arm64", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:android"], - [], - ), + constraint_values = + [ + "@platforms//cpu:aarch64", + "@platforms//os:android", + ], values = dict( if_oss( {"crosstool_top": "//external:android/crosstool"}, ), - cpu = "arm64-v8a", ), visibility = ["//visibility:public"], ) -config_setting( - name = "android_mips", - values = { - "crosstool_top": "//external:android/crosstool", - "cpu": "mips", - }, - visibility = ["//visibility:public"], -) - config_setting( name = "android_mips64", + constraint_values = + [ + "@platforms//cpu:mips64", + "@platforms//os:android", + ], values = { "crosstool_top": "//external:android/crosstool", - "cpu": "mips64", }, visibility = ["//visibility:public"], ) @@ -402,16 +409,10 @@ config_setting( # TODO(jakeharmon8): Remove in favor of TSL version config_setting( name = "windows", - # Internal builds query the target OS. - constraint_values = if_google( - ["//third_party/bazel_platforms/os:windows"], - [], - ), - # OSS builds query the CPU type. - values = if_oss( - {"cpu": "x64_windows"}, - {}, - ), + constraint_values = + [ + "@platforms//os:windows", + ], visibility = ["//visibility:public"], ) @@ -421,52 +422,28 @@ config_setting( visibility = ["//visibility:public"], ) -# Sometimes Bazel reports darwin_x86_64 as "darwin" and sometimes as -# "darwin_x86_64". The former shows up when building on a Mac x86_64 host for a Mac x86_64 target. -# The latter shows up when cross-compiling for Mac x86_64 from a Mac ARM machine and in internal -# Google builds. 
-config_setting( - name = "macos_x86_64_default", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:macos"], - [], - ), - values = { - "apple_platform_type": "macos", - "cpu": "darwin", - }, -) - config_setting( - name = "macos_x86_64_crosscompile", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:macos"], - [], - ), + name = "macos_x86_64", + constraint_values = + [ + "@platforms//cpu:x86_64", + "@platforms//os:macos", + ], values = { "apple_platform_type": "macos", - "cpu": "darwin_x86_64", }, -) - -selects.config_setting_group( - name = "macos_x86_64", - match_any = [ - ":macos_x86_64_default", - ":macos_x86_64_crosscompile", - ], visibility = ["//visibility:public"], ) config_setting( name = "macos_arm64", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:macos"], - [], - ), + constraint_values = + [ + "@platforms//cpu:aarch64", + "@platforms//os:macos", + ], values = { "apple_platform_type": "macos", - "cpu": "darwin_arm64", }, visibility = ["//visibility:public"], ) @@ -484,7 +461,7 @@ selects.config_setting_group( config_setting( name = "ios", constraint_values = if_google( - ["//third_party/bazel_platforms/os:ios"], + ["@platforms//os:ios"], [], ), values = if_oss( @@ -497,41 +474,32 @@ config_setting( # TODO(jakeharmon8): Remove in favor of TSL version config_setting( name = "fuchsia", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:fuchsia"], - [], - ), - values = if_oss( - # TODO(b/149248802) When we have a Fuchsia Bazel SDK update to use the values it sets. - {"cpu": "fuchsia"}, - {}, - ), + constraint_values = + ["@platforms//os:fuchsia"], visibility = ["//visibility:public"], ) config_setting( name = "fuchsia_x86_64", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:fuchsia"], - [], - ), - values = { - "cpu": "x86_64", - }, + constraint_values = + [ + "@platforms//cpu:x86_64", + "@platforms//os:fuchsia", + ], visibility = ["//visibility:public"], ) config_setting( name = "ios_x86_64", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:ios"], - [], - ), + constraint_values = + [ + "@platforms//cpu:x86_64", + "@platforms//os:ios", + ], values = dict( if_oss( {"crosstool_top": "//tools/osx/crosstool:crosstool"}, ), - cpu = "ios_x86_64", ), visibility = ["//visibility:public"], ) @@ -539,7 +507,7 @@ config_setting( config_setting( name = "chromiumos", constraint_values = if_google( - ["//third_party/bazel_platforms/os:chromiumos"], + ["@platforms//os:chromiumos"], [], ), values = if_oss( @@ -551,49 +519,43 @@ config_setting( config_setting( name = "linux_aarch64", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:linux"], - [], - ), - values = {"cpu": "aarch64"}, + constraint_values = + [ + "@platforms//cpu:aarch64", + "@platforms//os:linux", + ], visibility = ["//visibility:public"], ) config_setting( name = "linux_armhf", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:linux"], - [], - ), - values = {"cpu": "armhf"}, + constraint_values = + [ + "@platforms//cpu:armv7e-mf", + "@platforms//os:linux", + ], visibility = ["//visibility:public"], ) config_setting( name = "linux_x86_64", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:linux"], - [], - ), - values = {"cpu": "k8"}, - visibility = ["//visibility:public"], -) - -config_setting( - name = "haswell", - values = {"cpu": "haswell"}, + constraint_values = + [ + "@platforms//cpu:x86_64", + "@platforms//os:linux", + ], visibility = 
["//visibility:public"], ) # This condition takes precedence over :linux_x86_64 config_setting( name = "linux_x86_64_no_sse", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:linux"], - [], - ), + constraint_values = + [ + "@platforms//cpu:x86_64", + "@platforms//os:linux", + ], values = { - "cpu": "k8", "copt": "-mno-sse4.2", }, visibility = ["//visibility:public"], @@ -603,52 +565,52 @@ config_setting( # TODO(b/290533709): Remove this with PJRT build rule cleanup. config_setting( name = "linux_x86_64_with_weightwatcher", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:linux"], - [], - ), + constraint_values = + [ + "@platforms//cpu:x86_64", + "@platforms//os:linux", + ], define_values = {"tensorflow_weightwatcher": "true"}, - values = {"cpu": "k8"}, visibility = ["//visibility:public"], ) config_setting( name = "linux_ppc64le", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:linux"], - [], - ), - values = {"cpu": "ppc"}, + constraint_values = + [ + "@platforms//cpu:ppc64le", + "@platforms//os:linux", + ], visibility = ["//visibility:public"], ) config_setting( name = "linux_s390x", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:linux"], - [], - ), - values = {"cpu": "s390x"}, + constraint_values = + [ + "@platforms//cpu:s390x", + "@platforms//os:linux", + ], visibility = ["//visibility:public"], ) config_setting( name = "linux_mips64", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:linux"], - [], - ), - values = {"cpu": "mips64"}, + constraint_values = + [ + "@platforms//cpu:mips64", + "@platforms//os:linux", + ], visibility = ["//visibility:public"], ) config_setting( name = "linux_riscv64", - constraint_values = if_google( - ["//third_party/bazel_platforms/os:linux"], - [], - ), - values = {"cpu": "riscv64"}, + constraint_values = + [ + "@platforms//cpu:riscv64", + "@platforms//os:linux", + ], visibility = ["//visibility:public"], ) @@ -668,45 +630,25 @@ config_setting( visibility = ["//visibility:public"], ) -config_setting( - name = "arm", - values = {"cpu": "arm"}, - visibility = ["//visibility:public"], -) - -config_setting( - name = "armeabi", - values = {"cpu": "armeabi"}, - visibility = ["//visibility:public"], -) - -config_setting( - name = "armeabi-v7a", - values = {"cpu": "armeabi-v7a"}, - visibility = ["//visibility:public"], -) - -config_setting( - name = "arm64-v8a", - values = {"cpu": "arm64-v8a"}, - visibility = ["//visibility:public"], -) - selects.config_setting_group( name = "arm_any", match_any = [ - ":arm", - ":armeabi", - ":armeabi-v7a", - ":arm64-v8a", - ":linux_aarch64", - ":linux_armhf", + "@platforms//cpu:aarch32", + "@platforms//cpu:aarch64", + "@platforms//cpu:armv6-m", + "@platforms//cpu:armv7", + "@platforms//cpu:armv7-m", + "@platforms//cpu:armv7e-m", + "@platforms//cpu:armv7e-mf", ], ) config_setting( name = "freebsd", - values = {"cpu": "freebsd"}, + constraint_values = [ + "@platforms//os:freebsd", + "@platforms//cpu:x86_64", + ], visibility = ["//visibility:public"], ) @@ -900,7 +842,7 @@ config_setting( ) # This flag disables generating tensorflow.lite.python under LiteRT repo. -# Avoid using flag for creating tflite wheels as tensorflow/lite is not yet fully splitted from tf. +# Avoid using flag for creating tflite wheels as tensorflow/lite is not yet fully split from tf. 
config_setting( name = "disable_tf_lite_py", define_values = {"disable_tf_lite_py": "true"}, @@ -1140,13 +1082,13 @@ bzl_library( ":tf_version_bzl", "//tensorflow/core/platform:build_config_root_bzl", "//tensorflow/core/platform:rules_cc_bzl", - "//third_party/compute_library:build_defs_bzl", - "//third_party/llvm_openmp:openmp_bzl", "@bazel_skylib//lib:new_sets", "@bazel_skylib//rules:common_settings", "@local_config_cuda//cuda:build_defs_bzl", "@local_config_rocm//rocm:build_defs_bzl", "@local_config_tensorrt//:build_defs_bzl", + "@local_xla//third_party/compute_library:build_defs_bzl", + "@local_xla//third_party/llvm_openmp:openmp_bzl", "@local_xla//third_party/py/rules_pywrap:pywrap_bzl", "@local_xla//xla/tsl:tsl_bzl", "@local_xla//xla/tsl/mkl:build_defs_bzl", @@ -1362,7 +1304,7 @@ tf_cc_shared_library( ], "//tensorflow:windows": [], "//conditions:default": [ - "-z defs", + "-Wl,-z,defs", "-Wl,--version-script,$(location //tensorflow:tf_version_script.lds)", ], }), @@ -1773,6 +1715,7 @@ py_library( "//tensorflow/lite/python:lite", "//tensorflow/lite/python/authoring", "//tensorflow/python:no_contrib", + "//tensorflow/python/profiler:profiler_client", "@pypi_keras_nightly//:pkg", "@pypi_tb_nightly//:pkg", ], diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index e4ee61063fa0..793d6312a837 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -923,8 +923,8 @@ void TF_SetAttrShape(TF_OperationDescription* desc, const char* attr_name, const int64_t* dims, int num_dims) { PartialTensorShape shape; if (num_dims >= 0) { - shape = PartialTensorShape( - ArraySlice(reinterpret_cast(dims), num_dims)); + shape = PartialTensorShape(absl::Span( + reinterpret_cast(dims), num_dims)); } desc->node_builder.Attr(attr_name, shape); } @@ -938,7 +938,7 @@ void TF_SetAttrShapeList(TF_OperationDescription* desc, const char* attr_name, if (num_dims[i] < 0) { shapes.emplace_back(); } else { - shapes.emplace_back(ArraySlice( + shapes.emplace_back(absl::Span( reinterpret_cast(dims[i]), num_dims[i])); } } diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index f4b480752c90..e4c2c92783d4 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -1107,7 +1107,6 @@ cc_library( ":c_api", ":c_api_experimental", ":tfe_tensorhandle_internal", - "//tensorflow/c:tf_status_helper", "//tensorflow/c:tf_status_internal", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", @@ -1120,7 +1119,7 @@ cc_library( tf_cuda_cc_test( name = "dlpack_test", - size = "small", + size = "medium", srcs = [ "dlpack_test.cc", ], diff --git a/tensorflow/c/eager/dlpack.cc b/tensorflow/c/eager/dlpack.cc index e3447215192f..6bfe6363bb35 100644 --- a/tensorflow/c/eager/dlpack.cc +++ b/tensorflow/c/eager/dlpack.cc @@ -279,6 +279,11 @@ bool IsValidStrideCompactRowMajorData(int64_t* shape_arr, int64_t* stride_arr, } } // namespace +void* TFE_GetDLDevice(TFE_TensorHandle* h, TF_Status* status) { + auto dl_device = GetDlContext(h, status); + return new DLDevice{dl_device.device_type, dl_device.device_id}; +} + void TFE_CallDLManagedTensorDeleter(void* dlm_ptr) { DLManagedTensor* dlMTensor = static_cast(dlm_ptr); if (dlMTensor->deleter != nullptr) { diff --git a/tensorflow/c/eager/dlpack.h b/tensorflow/c/eager/dlpack.h index 8c85dee62f78..e2deb835863a 100644 --- a/tensorflow/c/eager/dlpack.h +++ b/tensorflow/c/eager/dlpack.h @@ -23,6 +23,13 @@ namespace tensorflow { // PyCapsule name for DLPack Tensor const char* const kDlTensorCapsuleName = "dltensor"; +// Returns the DLDevice* for the 
given eager tensor handle. +// +// The caller takes ownership of the returned pointer and is responsible for +// deleting it. +TF_CAPI_EXPORT extern void* TFE_GetDLDevice(TFE_TensorHandle* h, + TF_Status* status); + // Converts eager tensor handle to DLPack (DLManagedTensor*), and return the // void* for further PyCapsule construction. TF_CAPI_EXPORT extern void* TFE_HandleToDLPack(TFE_TensorHandle* h, diff --git a/tensorflow/c/experimental/pluggable_profiler/BUILD b/tensorflow/c/experimental/pluggable_profiler/BUILD index 03ea7f148e24..dfdd9e5aada4 100644 --- a/tensorflow/c/experimental/pluggable_profiler/BUILD +++ b/tensorflow/c/experimental/pluggable_profiler/BUILD @@ -43,9 +43,9 @@ cc_library( "//tensorflow/core/common_runtime/device:device_utils", "//tensorflow/core/profiler/lib:profiler_factory", "//tensorflow/core/profiler/lib:profiler_interface", - "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "@com_google_absl//absl/memory", "@com_google_absl//absl/status", + "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc", ], ) @@ -65,7 +65,7 @@ cc_library( "//tensorflow/c:tf_status_helper", "//tensorflow/core/platform:status", "//tensorflow/core/profiler/lib:profiler_interface", - "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "@local_tsl//tsl/profiler/protobuf:profiler_options_proto_cc", + "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc", ], ) diff --git a/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler_internal.h b/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler_internal.h index 55af07ad79f4..0262db81b486 100644 --- a/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler_internal.h +++ b/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler_internal.h @@ -18,8 +18,8 @@ limitations under the License. 
#include "tensorflow/c/tf_status_helper.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/profiler/lib/profiler_interface.h" -#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tsl/profiler/protobuf/profiler_options.pb.h" +#include "tsl/profiler/protobuf/xplane.pb.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/c/experimental/saved_model/core/test_utils.cc b/tensorflow/c/experimental/saved_model/core/test_utils.cc index 65b70906d30e..ffe89d5b71e2 100644 --- a/tensorflow/c/experimental/saved_model/core/test_utils.cc +++ b/tensorflow/c/experimental/saved_model/core/test_utils.cc @@ -103,6 +103,8 @@ void FillNumericTensorBuffer(DataType dtype, size_t num_elements, void* buffer, TF_CALL_float(CASE); TF_CALL_int4(CASE); TF_CALL_uint4(CASE); + TF_CALL_int2(CASE); + TF_CALL_uint2(CASE); #undef CASE default: CHECK(false) << "Unsupported data type: " << DataTypeString(dtype); @@ -135,6 +137,8 @@ void CheckBufferDataIsEqual(DataType dtype, int64_t num_elements, void* a, TF_CALL_float(CASE); TF_CALL_int4(CASE); TF_CALL_uint4(CASE); + TF_CALL_int2(CASE); + TF_CALL_uint2(CASE); #undef CASE default: CHECK(false) << "Unsupported data type: " << DataTypeString(dtype); diff --git a/tensorflow/c/experimental/stream_executor/test/BUILD b/tensorflow/c/experimental/stream_executor/test/BUILD index 2a4d40b3e797..9594e2a1c22b 100644 --- a/tensorflow/c/experimental/stream_executor/test/BUILD +++ b/tensorflow/c/experimental/stream_executor/test/BUILD @@ -19,3 +19,13 @@ tf_cc_shared_object( "//tensorflow/c/experimental/stream_executor:stream_executor_test_util", ], ) + +cc_library( + name = "test_pluggable_device", + srcs = ["test_pluggable_device.cc"], + visibility = ["//tensorflow/core:__subpackages__"], + deps = [ + "//tensorflow/c/experimental/stream_executor:stream_executor_hdrs", + "//tensorflow/c/experimental/stream_executor:stream_executor_test_util", + ], +) diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc index 80474eb68130..c1821fb1c2dd 100644 --- a/tensorflow/c/kernels.cc +++ b/tensorflow/c/kernels.cc @@ -875,8 +875,8 @@ TF_Tensor* TF_ForwardInputOrAllocateOutput( TF_SetStatus(status, TF_OK, ""); auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context); - tensorflow::gtl::ArraySlice input_indices_array( - candidate_input_indices, num_candidate_input_indices); + absl::Span input_indices_array(candidate_input_indices, + num_candidate_input_indices); tensorflow::gtl::ArraySlice output_dimarray( reinterpret_cast(output_dims), output_num_dims); tensorflow::Tensor* output_tensor_pointer; diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h index fd7f99cdf990..d9c956f83c44 100644 --- a/tensorflow/c/kernels.h +++ b/tensorflow/c/kernels.h @@ -115,8 +115,8 @@ TF_CAPI_EXPORT extern TF_KernelBuilder* TF_NewAsyncKernelBuilder( // Specifies that this kernel's attribute only supports the given type. TF_CAPI_EXPORT extern void TF_KernelBuilder_TypeConstraint( - TF_KernelBuilder* kernel_builder, const char* attr_name, - const TF_DataType type, TF_Status* status); + TF_KernelBuilder* kernel_builder, const char* attr_name, TF_DataType type, + TF_Status* status); // Specify that this kernel requires/provides an input/output arg // in host memory (instead of the default, device memory). 
diff --git a/tensorflow/c/kernels/BUILD b/tensorflow/c/kernels/BUILD index f8431498eb51..6e8dbc8512fa 100644 --- a/tensorflow/c/kernels/BUILD +++ b/tensorflow/c/kernels/BUILD @@ -19,6 +19,7 @@ tf_kernel_library( "//tensorflow/c:tf_tensor", "//tensorflow/core:framework", "//tensorflow/core:lib", + "@com_google_absl//absl/log:check", ], ) diff --git a/tensorflow/c/kernels/bitcast_op.cc b/tensorflow/c/kernels/bitcast_op.cc index f104804bdf90..d60cdb8173d9 100644 --- a/tensorflow/c/kernels/bitcast_op.cc +++ b/tensorflow/c/kernels/bitcast_op.cc @@ -13,8 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include +#include #include +#include "absl/log/check.h" #include "tensorflow/c/kernels.h" #include "tensorflow/c/ops.h" #include "tensorflow/c/tf_tensor.h" diff --git a/tensorflow/c/tf_datatype.h b/tensorflow/c/tf_datatype.h index 448207bf4299..02a38e9b164e 100644 --- a/tensorflow/c/tf_datatype.h +++ b/tensorflow/c/tf_datatype.h @@ -63,6 +63,8 @@ typedef enum TF_DataType { // finite-only,with NaN. TF_INT4 = 29, TF_UINT4 = 30, + TF_INT2 = 31, + TF_UINT2 = 32, } TF_DataType; // TF_DataTypeSize returns the sizeof() for the underlying type corresponding diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 53622b8f155b..011ac8baee7f 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -492,6 +492,7 @@ cc_library( ], deps = [ ":constants", + ":fingerprinting_x_platform_utils", "//tensorflow/core:protos_all_cc", "//tensorflow/core/graph/regularization:simple_delete", "//tensorflow/core/graph/regularization:util", @@ -523,6 +524,7 @@ cc_library( "//learning/brain/contrib/tpu_modeling:__subpackages__", "//learning/metadata/artifactoid/cc:__subpackages__", "//learning/tfx/pipeline/util:__subpackages__", + "//tensorflow/core/tfrt:__subpackages__", "//tensorflow/python/saved_model:__subpackages__", ], deps = if_static([ @@ -544,6 +546,7 @@ cc_library( visibility = ["//visibility:private"], deps = [ ":constants", + ":fingerprinting_x_platform_utils", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/util/tensor_bundle:naming", @@ -560,6 +563,17 @@ cc_library( alwayslink = True, ) +cc_library( + name = "fingerprinting_x_platform_utils", + srcs = ["fingerprinting_x_platform_utils.cc"], + hdrs = ["fingerprinting_x_platform_utils.h"], + deps = [ + "@com_google_absl//absl/numeric:int128", + "@com_google_absl//absl/strings:str_format", + "@local_tsl//tsl/platform:random", + ], +) + tf_cc_test( name = "fingerprinting_utils_test", srcs = ["fingerprinting_utils_test.cc"], @@ -633,6 +647,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/numeric:int128", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", diff --git a/tensorflow/cc/saved_model/fingerprinting.cc b/tensorflow/cc/saved_model/fingerprinting.cc index edb61db527c6..9a46f3507f56 100644 --- a/tensorflow/cc/saved_model/fingerprinting.cc +++ b/tensorflow/cc/saved_model/fingerprinting.cc @@ -23,10 +23,10 @@ limitations under the License. 
#include "absl/status/status.h" #include "absl/status/statusor.h" #include "absl/strings/str_cat.h" -#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "absl/strings/strip.h" #include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/cc/saved_model/fingerprinting_x_platform_utils.h" #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/graph/regularization/simple_delete.h" #include "tensorflow/core/graph/regularization/util.h" @@ -40,7 +40,6 @@ limitations under the License. #include "tensorflow/core/protobuf/saved_model.pb.h" #include "tensorflow/core/protobuf/saved_object_graph.pb.h" #include "tensorflow/core/util/tensor_bundle/naming.h" -#include "tsl/platform/random.h" // b/291933687, b/291001524 #if !defined(PLATFORM_WINDOWS) && !defined(__APPLE__) #include "tensorflow/cc/saved_model/fingerprinting_utils.h" @@ -184,7 +183,7 @@ absl::StatusOr CreateFingerprintDefPb( // Set fingerprint field #5. fingerprint_def.set_checkpoint_hash(HashCheckpointIndexFile(export_dir)); // Assign a random UUID to the fingerprint. - fingerprint_def.set_uuid(absl::StrFormat("%016d", tsl::random::New64())); + fingerprint_def.set_uuid(CreateRandomUUID()); // Set version of the fingerprint. VersionDef* version = fingerprint_def.mutable_version(); version->set_producer(kFingerprintProducer); diff --git a/tensorflow/cc/saved_model/fingerprinting_test.cc b/tensorflow/cc/saved_model/fingerprinting_test.cc index dbc784eb8de5..36f2fe7917bf 100644 --- a/tensorflow/cc/saved_model/fingerprinting_test.cc +++ b/tensorflow/cc/saved_model/fingerprinting_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/numeric/int128.h" #include "absl/status/status.h" #include "absl/status/statusor.h" #include "absl/strings/numbers.h" @@ -63,9 +64,13 @@ TEST(FingerprintingTest, TestCreateFingerprint) { EXPECT_EQ(fingerprint_def.signature_def_hash(), 15570736222402453744U); EXPECT_EQ(fingerprint_def.saved_object_graph_hash(), 3678101440349108924U); - // The uuid is a random number, but it should be a number > 0. - uint64 uuid = 0; - EXPECT_TRUE(absl::SimpleAtoi(fingerprint_def.uuid(), &uuid)); + // The uuid is a random number (as string), but it should be a number > 0. + absl::uint128 uuid = 0; + EXPECT_TRUE(absl::SimpleAtoi(fingerprint_def.uuid(), &uuid)) + << "String to Uint128 conversion failed. " + << "UUID from proto, and Uint128Max(): \n" + << fingerprint_def.uuid() << "\n" + << absl::Uint128Max(); EXPECT_GT(uuid, 0); // TODO(b/242348400): The checkpoint hash is non-deterministic, so we cannot diff --git a/tensorflow/cc/saved_model/fingerprinting_utils.cc b/tensorflow/cc/saved_model/fingerprinting_utils.cc index a41ab4ecd02b..460d34c36aa8 100644 --- a/tensorflow/cc/saved_model/fingerprinting_utils.cc +++ b/tensorflow/cc/saved_model/fingerprinting_utils.cc @@ -25,11 +25,11 @@ limitations under the License. #include "absl/status/status.h" #include "absl/status/statusor.h" #include "absl/strings/str_cat.h" -#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "riegeli/bytes/fd_reader.h" // from @riegeli #include "riegeli/records/record_reader.h" // from @riegeli #include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/cc/saved_model/fingerprinting_x_platform_utils.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/op_def.pb.h" @@ -47,7 +47,6 @@ limitations under the License. 
#include "tensorflow/tools/proto_splitter/chunk.pb.h" #include "tensorflow/tools/proto_splitter/merge.h" #include "tsl/platform/errors.h" -#include "tsl/platform/random.h" #include "tsl/platform/statusor.h" // IWYU pragma: no_include "third_party/protobuf/repeated_ptr_field.h" // IWYU pragma: no_include "third_party/protobuf/io/coded_stream.h" @@ -475,7 +474,8 @@ absl::StatusOr CreateFingerprintDefCpb( fingerprint_def.set_checkpoint_hash(HashCheckpointIndexFile(export_dir)); - fingerprint_def.set_uuid(absl::StrFormat("%016d", tsl::random::New64())); + // Assign a random UUID to the fingerprint. + fingerprint_def.set_uuid(fingerprinting::CreateRandomUUID()); reader.Close(); // Set version of the fingerprint. diff --git a/tensorflow/cc/saved_model/fingerprinting_x_platform_utils.cc b/tensorflow/cc/saved_model/fingerprinting_x_platform_utils.cc new file mode 100644 index 000000000000..7273ec720c4d --- /dev/null +++ b/tensorflow/cc/saved_model/fingerprinting_x_platform_utils.cc @@ -0,0 +1,36 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/saved_model/fingerprinting_x_platform_utils.h" + +#include + +#include "absl/numeric/int128.h" +#include "absl/strings/str_format.h" +#include "tsl/platform/random.h" + +// UINT64MAX is 18'446'744'073'709'551'615 (20 digits) +// UINT128MAX is 340'282'366'920'938'463'463'374'607'431'768'211'455 (39 dgts) +// After sqrt(INT64MAX) = 4'294'967'296 (4B models), it's 50% likely to be +// duplicates in the ID space. In comparison, sqrt(UINT128MAX) = UINT64MAX, +// meaning that we can continue generating unique IDs for a lot longer time +// if the UUID is generated from two random UINT64s. This can be replaced by +// random::New128() if that becomes available. +std::string tensorflow::saved_model::fingerprinting::CreateRandomUUID() { + absl::uint128 uuid_1 = tsl::random::New64(); + absl::uint128 uuid_2 = tsl::random::New64(); + absl::uint128 uuid_complete = (uuid_1 << 64) | uuid_2; + return absl::StrFormat("%020d", uuid_complete); +} diff --git a/tensorflow/cc/saved_model/fingerprinting_x_platform_utils.h b/tensorflow/cc/saved_model/fingerprinting_x_platform_utils.h new file mode 100644 index 000000000000..4f555f055321 --- /dev/null +++ b/tensorflow/cc/saved_model/fingerprinting_x_platform_utils.h @@ -0,0 +1,28 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_FINGERPRINTING_X_PLATFORM_UTILS_H_ +#define TENSORFLOW_CC_SAVED_MODEL_FINGERPRINTING_X_PLATFORM_UTILS_H_ + +#include + +namespace tensorflow::saved_model::fingerprinting { + +// Returns a random UUID (128 bits random) as a string. +std::string CreateRandomUUID(); + +} // namespace tensorflow::saved_model::fingerprinting + +#endif // TENSORFLOW_CC_SAVED_MODEL_FINGERPRINTING_X_PLATFORM_UTILS_H_ diff --git a/tensorflow/cc/training/coordinator.cc b/tensorflow/cc/training/coordinator.cc index 68f1a9cf85b5..7e25c310edb1 100644 --- a/tensorflow/cc/training/coordinator.cc +++ b/tensorflow/cc/training/coordinator.cc @@ -15,6 +15,10 @@ limitations under the License. #include "tensorflow/cc/training/coordinator.h" +#include +#include +#include + #include "absl/status/status.h" #include "xla/tsl/protobuf/error_codes.pb.h" #include "tensorflow/core/framework/cost_graph.pb.h" diff --git a/tensorflow/cc/training/queue_runner.cc b/tensorflow/cc/training/queue_runner.cc index 5f6a41e63f8d..829747f718f9 100644 --- a/tensorflow/cc/training/queue_runner.cc +++ b/tensorflow/cc/training/queue_runner.cc @@ -15,6 +15,11 @@ limitations under the License. #include "tensorflow/cc/training/queue_runner.h" +#include +#include +#include +#include + #include "absl/log/log.h" #include "absl/status/status.h" #include "tensorflow/cc/training/coordinator.h" diff --git a/tensorflow/cc/training/queue_runner.h b/tensorflow/cc/training/queue_runner.h index 3122ff313e84..b994bce49858 100644 --- a/tensorflow/cc/training/queue_runner.h +++ b/tensorflow/cc/training/queue_runner.h @@ -21,6 +21,7 @@ limitations under the License. #include #include +#include "absl/status/status.h" #include "tensorflow/cc/training/coordinator.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/threadpool.h" diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index 5bcd1d07da85..273741a750dd 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -36,6 +36,28 @@ filegroup( visibility = ["//visibility:public"], ) +cc_library( + name = "thunk_proto_execution_deserializer", + srcs = ["thunk_proto_execution_deserializer.cc"], + hdrs = ["thunk_proto_execution_deserializer.h"], + deps = [ + "@com_google_absl//absl/numeric:int128", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@local_xla//xla:cpu_function_runtime", + "@local_xla//xla:shape_util", + "@local_xla//xla:util", + "@local_xla//xla:xla_data_proto_cc", + "@local_xla//xla/backends/cpu/runtime:convolution_lib", + "@local_xla//xla/backends/cpu/runtime:dot_lib", + "@local_xla//xla/backends/cpu/runtime:thunk_proto_cc", + "@local_xla//xla/service/cpu:cpu_aot_compilation_result", + "@local_xla//xla/service/cpu:cpu_executable", + "@local_xla//xla/service/cpu:executable_proto_cc", + ], +) + cc_library( name = "tfcompile_lib", srcs = [ @@ -101,6 +123,7 @@ cc_library( "@local_xla//xla/hlo/builder:xla_computation", "@local_xla//xla/service:compiler", "@local_xla//xla/service/cpu:buffer_info_util", + "@local_xla//xla/service/cpu:cpu_aot_compilation_result", "@local_xla//xla/service/cpu:cpu_compiler", "@local_xla//xla/stream_executor:platform_manager", ], @@ -121,10 +144,12 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/platform:resource_loader", + "@com_google_absl//absl/memory", 
"@com_google_absl//absl/strings", "@llvm-project//llvm:Support", # fixdeps: keep "@local_xla//xla:cpu_function_runtime", "@local_xla//xla:shape_util", + "@local_xla//xla/service/cpu:cpu_aot_compilation_result", ] + if_llvm_x86_available([ "@llvm-project//llvm:X86CodeGen", # fixdeps: keep ]), @@ -173,6 +198,7 @@ cc_library( "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@local_xla//xla:debug_options_flags", @@ -334,6 +360,43 @@ cc_library( ], ) +cc_library( + name = "embedded_constant_buffers", + srcs = ["embedded_constant_buffers.cc"], + hdrs = ["embedded_constant_buffers.h"], + deps = [ + "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:MC", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:Target", + "@llvm-project//llvm:TargetParser", + "@local_xla//xla:util", + "@local_xla//xla/service/llvm_ir:llvm_type_conversion_util", + ], +) + +tf_cc_test( + name = "embedded_constant_buffers_test", + srcs = ["embedded_constant_buffers_test.cc"], + deps = [ + ":embedded_constant_buffers", + ":llvm_targets", # fixdeps: keep + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", + "@local_tsl//tsl/platform:statusor", + "@local_xla//xla/service/cpu:test_header_helper", + ], +) + cc_library( name = "aot_only_var_handle_op", srcs = ["aot_only_var_handle_op.cc"], diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index 989c319da07d..b21d100eb8e5 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -40,7 +40,9 @@ limitations under the License. #include "xla/cpu_function_runtime.h" #include "xla/service/compiler.h" #include "xla/service/cpu/buffer_info_util.h" +#include "xla/shape.h" #include "xla/shape_util.h" +#include "xla/tsl/platform/statusor.h" #include "xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" @@ -142,12 +144,12 @@ absl::Status AddRewritesForShape( std::vector dim_vars; string dim_sizes, indices; int count = 1; - if (shape.rank() == 0 || - (shape.dimensions_size() == 1 && shape.dimensions(0) == 1)) { + if (shape.dimensions().size() == 0 || + (shape.dimensions().size() == 1 && shape.dimensions(0) == 1)) { dim_sizes = "[1]"; indices = "[0]"; } else { - for (int dim = 0; dim < shape.dimensions_size(); ++dim) { + for (int dim = 0; dim < shape.dimensions().size(); ++dim) { dim_vars.push_back(absl::StrCat("size_t dim", dim)); dim_sizes += absl::StrCat("[", shape.dimensions(dim), "]"); indices += absl::StrCat("[dim", dim, "]"); @@ -525,6 +527,7 @@ absl::Status GenerateHeader(const CodegenOpts& opts, TF_RETURN_IF_ERROR( CheckEqual(ps.result().tuple_shapes_size(), result_index_table.size(), "Result number mismatch, proto vs. 
result_index_table")); + TF_ASSIGN_OR_RETURN(auto program_shape, xla::ProgramShape::FromProto(ps)); const size_t arg_bytes_aligned = xla::cpu_function_runtime::AlignedBufferBytes( buffer_infos_for_args.data(), buffer_infos_for_args.size(), @@ -845,7 +848,7 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { {"{{METHODS_VARIABLE}}\n", methods_variable}, {"{{NS_END}}\n", ns_end}, {"{{NS_START}}\n", ns_start}, - {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(xla::ProgramShape(ps))}, + {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(program_shape)}, {"{{PROGRAM_SHAPE_SHIM_EXPRESSION}}", metadata_result.program_shape_access_shim}, {"{{VARIABLE_NAMES_CODE}}", variable_names_code}, diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc index 7056d8559014..7ba72b461d41 100644 --- a/tensorflow/compiler/aot/codegen_test.cc +++ b/tensorflow/compiler/aot/codegen_test.cc @@ -19,10 +19,13 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "absl/strings/match.h" #include "absl/strings/string_view.h" #include "llvm/Support/TargetSelect.h" +#include "tensorflow/compiler/aot/compile.h" #include "xla/cpu_function_runtime.h" +#include "xla/service/cpu/cpu_aot_compilation_result.h" #include "xla/shape_util.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.pb.h" @@ -215,24 +218,30 @@ TEST(CodegenTest, Golden) { variable3->mutable_shape()->add_dim()->set_size(5); variable3->set_type(DT_INT32); CompileResult compile_result; - compile_result.aot.reset(new xla::cpu::CpuAotCompilationResult( - {}, - {BufferInfo::MakeTempBuffer(3 * 8), - BufferInfo::MakeEntryParameter(/*size=*/8, /*entry_param_number=*/0), - BufferInfo::MakeTempBuffer(1), - BufferInfo::MakeEntryParameter(/*size=*/96, /*entry_param_number=*/1), - BufferInfo::MakeTempBuffer(1), - BufferInfo::MakeEntryParameter(/*size=*/96, /*entry_param_number=*/2), - BufferInfo::MakeTempBuffer(1), - BufferInfo::MakeEntryParameter(/*size=*/96, /*entry_param_number=*/3), - BufferInfo::MakeResultParameter(/*size=*/5 * 6 * 4, - /*result_param_number=*/0), - BufferInfo::MakeEntryParameter(/*size=*/96, /*entry_param_number=*/4), - BufferInfo::MakeResultParameter(/*size=*/1 * 4, - /*result_param_number=*/1), - BufferInfo::MakeResultParameter(/*size=*/5 * 4, - /*result_param_number=*/2)}, - 0, nullptr, {})); + compile_result.aot = + absl::WrapUnique(new xla::cpu::CpuAotCompilationResultLegacy( + {}, + {BufferInfo::MakeTempBuffer(3 * 8), + BufferInfo::MakeEntryParameter(/*size=*/8, + /*entry_param_number=*/0), + BufferInfo::MakeTempBuffer(1), + BufferInfo::MakeEntryParameter(/*size=*/96, + /*entry_param_number=*/1), + BufferInfo::MakeTempBuffer(1), + BufferInfo::MakeEntryParameter(/*size=*/96, + /*entry_param_number=*/2), + BufferInfo::MakeTempBuffer(1), + BufferInfo::MakeEntryParameter(/*size=*/96, + /*entry_param_number=*/3), + BufferInfo::MakeResultParameter(/*size=*/5 * 6 * 4, + /*result_param_number=*/0), + BufferInfo::MakeEntryParameter(/*size=*/96, + /*entry_param_number=*/4), + BufferInfo::MakeResultParameter(/*size=*/1 * 4, + /*result_param_number=*/1), + BufferInfo::MakeResultParameter(/*size=*/5 * 4, + /*result_param_number=*/2)}, + 0, nullptr, {})); compile_result.program_shape = xla::ShapeUtil::MakeProgramShape( { diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index b093034e5cd4..b3f6f30a4505 100644 --- a/tensorflow/compiler/aot/compile.cc +++ 
b/tensorflow/compiler/aot/compile.cc @@ -32,17 +32,16 @@ limitations under the License. #include "xla/client/client_library.h" #include "xla/client/compile_only_client.h" #include "xla/hlo/builder/xla_computation.h" -#include "xla/service/cpu/cpu_compiler.h" +#include "xla/service/cpu/cpu_aot_compilation_result.h" #include "xla/stream_executor/platform_manager.h" #include "xla/util.h" #include "xla/xla_data.pb.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/proto_serialization.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/regexp.h" +#include "tensorflow/core/platform/regexp.h" // IWYU pragma: keep #include "tensorflow/core/platform/types.h" namespace tensorflow { @@ -95,7 +94,7 @@ absl::Status CompileXla(xla::CompileOnlyClient* client, aot_or.status().message()); } compile_result->aot = - xla::unique_ptr_down_cast( + xla::unique_ptr_down_cast( std::move(aot_or.value().back())); compile_result->entry_point = aot_opts.entry_point_name(); compile_result->pointer_size = @@ -164,6 +163,11 @@ absl::Status CompileGraph(GraphDef graph_def, const tf2xla::Config& config, flags.sanitize_abilists_dataflow, ',', absl::SkipEmpty())); } + // AOT compilation is currently not supported for the thunk runtime. + if (aot_opts.debug_options().xla_cpu_use_thunk_runtime()) { + aot_opts.mutable_debug_options()->set_xla_cpu_use_thunk_runtime(false); + } + return CompileXla(client, computation, aot_opts, compile_result); } diff --git a/tensorflow/compiler/aot/compile.h b/tensorflow/compiler/aot/compile.h index 9d3ff78af89a..4d9901b52aac 100644 --- a/tensorflow/compiler/aot/compile.h +++ b/tensorflow/compiler/aot/compile.h @@ -33,7 +33,7 @@ namespace tfcompile { // data and meta-information is available in aot. struct CompileResult { // Contains object file and meta-info. - std::unique_ptr aot; + std::unique_ptr aot; xla::ProgramShapeProto program_shape; // Static shape of args and results. string entry_point; // Name of generated function. int pointer_size = 0; // Size of a pointer in bytes. diff --git a/tensorflow/compiler/aot/embedded_constant_buffers.cc b/tensorflow/compiler/aot/embedded_constant_buffers.cc new file mode 100644 index 000000000000..e81d87760499 --- /dev/null +++ b/tensorflow/compiler/aot/embedded_constant_buffers.cc @@ -0,0 +1,167 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/aot/embedded_constant_buffers.h" + +#include + +#include +#include +#include +#include +#include +#include + +#include "absl/memory/memory.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/TargetParser/Triple.h" +#include "xla/service/llvm_ir/llvm_type_conversion_util.h" +#include "xla/tsl/platform/errors.h" +#include "xla/tsl/platform/statusor.h" +#include "xla/util.h" + +namespace tensorflow { +namespace tfcompile { + +using xla::llvm_ir::AsStringRef; + +void ConstantToEmbed::SerializeIntoBuffer(absl::Span buffer) { + // Allocate memory for the size of the buffer and the buffer itself. + const uint64_t buffer_size = buffer.size(); + data_buffer.resize(sizeof(uint64_t) + buffer_size); + std::memcpy(data_buffer.data(), &buffer_size, sizeof(uint64_t)); + std::memcpy(data_buffer.data() + sizeof(uint64_t), buffer.data(), + buffer.size()); +} + +static absl::Status AddBufferToLlvmModule( + llvm::Module* module, const ConstantToEmbed& constant_to_embed, + absl::string_view unique_identifier, + std::string& constant_array_symbol_name) { + if (constant_to_embed.data().empty()) { + return xla::Internal( + "Constant buffer shouldn't be empty, it should at least contain the " + "size of the buffer."); + } + + absl::Span buffer_contents = constant_to_embed.data(); + + llvm::Constant* buffer_initializer = llvm::ConstantDataVector::get( + module->getContext(), + llvm::ArrayRef(buffer_contents.data(), buffer_contents.size())); + + constant_array_symbol_name = + absl::StrCat(unique_identifier, "_constant_buffer_contents"); + new llvm::GlobalVariable( + *module, buffer_initializer->getType(), + /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, + buffer_initializer, AsStringRef(constant_array_symbol_name)); + + return absl::OkStatus(); +} + +static absl::StatusOr CodegenModule( + llvm::TargetMachine* target_machine, std::unique_ptr module) { + llvm::SmallVector stream_buffer; + llvm::raw_svector_ostream ostream(stream_buffer); + llvm::legacy::PassManager codegen_passes; + + if (target_machine->addPassesToEmitFile(codegen_passes, ostream, nullptr, + llvm::CodeGenFileType::ObjectFile)) { + return xla::Internal( + "Could not create pass pipeline to generate object file"); + } + + codegen_passes.run(*module); + + return std::string(stream_buffer.begin(), stream_buffer.end()); +} + +static absl::StatusOr> +GetTargetMachineFromTriple(absl::string_view target_triple) { + std::string error; + std::string normalized_triple = + llvm::Triple::normalize(AsStringRef(absl::string_view(target_triple))); + const llvm::Target* target = + llvm::TargetRegistry::lookupTarget(normalized_triple, error); + if (target == nullptr) { + return xla::Internal("TargetRegistry::lookupTarget failed: %s", + error.c_str()); + } + + return absl::WrapUnique(target->createTargetMachine( + normalized_triple, /*CPU=*/"", + 
/*Features=*/"", llvm::TargetOptions(), std::nullopt)); +} + +absl::StatusOr CreateEmbeddedConstantBuffers( + absl::string_view target_triple, + absl::Span constants_to_embed) { + TF_ASSIGN_OR_RETURN(std::unique_ptr target_machine, + GetTargetMachineFromTriple(target_triple)); + + llvm::LLVMContext llvm_context; + auto module_with_serialized_proto = std::make_unique( + "embedded_constant_data_module", llvm_context); + + EmbeddedConstantBuffers result; + + for (const ConstantToEmbed& constant_to_embed : constants_to_embed) { + std::string constant_array_symbol_name; + + TF_RETURN_IF_ERROR(AddBufferToLlvmModule( + module_with_serialized_proto.get(), constant_to_embed, + constant_to_embed.symbol_prefix, constant_array_symbol_name)); + + std::string cpp_variable_decl = + absl::StrCat("extern \"C\" char ", constant_array_symbol_name, "[];"); + + std::string cpp_access_shim = absl::StrFormat(R"( + [](char* buffer) -> std::pair { + uint64_t buffer_size; + std::memcpy(&buffer_size, buffer, sizeof(uint64_t)); + return {buffer_size, buffer + sizeof(uint64_t)}; + }(%s) + )", + constant_array_symbol_name); + result.variable_decls.push_back( + {constant_array_symbol_name, cpp_variable_decl, cpp_access_shim}); + } + + TF_ASSIGN_OR_RETURN(result.object_file_data, + CodegenModule(target_machine.get(), + std::move(module_with_serialized_proto))); + return result; +} + +} // namespace tfcompile +} // namespace tensorflow diff --git a/tensorflow/compiler/aot/embedded_constant_buffers.h b/tensorflow/compiler/aot/embedded_constant_buffers.h new file mode 100644 index 000000000000..15f4b17ad342 --- /dev/null +++ b/tensorflow/compiler/aot/embedded_constant_buffers.h @@ -0,0 +1,77 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_AOT_EMBEDDED_CONSTANT_BUFFERS_H_ +#define TENSORFLOW_COMPILER_AOT_EMBEDDED_CONSTANT_BUFFERS_H_ + +#include +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" + +namespace tensorflow { +namespace tfcompile { + +// Represents a set of constant buffers embedded into an object file. +struct EmbeddedConstantBuffers { + struct VariableInfo { + // variable_name is the name of the variable from variable_decl. + std::string variable_name; + + // `variable_decl` is an "extern C" array declaration that is used in + // `expression`. + std::string variable_decl; + + // `cpp_access_shim` is a C++ expression that receives a pointer to the + // start of the buffer with size and returns the size and a pointer + // to the start of the buffer data. + std::string cpp_access_shim; + }; + // Variable infos for each constant buffer. + std::vector variable_decls; + + // The contents of the object (".o") file the constant buffers are embedded + // in. + std::string object_file_data; +}; + +// Describes a protocol buffer to embed into an object file. 
+struct ConstantToEmbed { + // `symbol_prefix` is a prefix that is guaranteed to be unique across the + // binary or DSO the generated object file will be linked into. + std::string symbol_prefix; + + // Serializes the size of the `buffer` and its contents into `data_buffer`. + void SerializeIntoBuffer(absl::Span buffer); + + const std::vector& data() const { return data_buffer; } + + private: + // `data_buffer` is the constant buffer to be embedded. It contains the + // number of bytes in the buffer followed by its contents. + std::vector data_buffer; +}; + +absl::StatusOr<EmbeddedConstantBuffers> CreateEmbeddedConstantBuffers( + absl::string_view target_triple, + absl::Span constants_to_embed); + +} // namespace tfcompile +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_AOT_EMBEDDED_CONSTANT_BUFFERS_H_ diff --git a/tensorflow/compiler/aot/embedded_constant_buffers_test.cc b/tensorflow/compiler/aot/embedded_constant_buffers_test.cc new file mode 100644 index 000000000000..5ada34794d31 --- /dev/null +++ b/tensorflow/compiler/aot/embedded_constant_buffers_test.cc @@ -0,0 +1,67 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/aot/embedded_constant_buffers.h" + +#include +#include + +#include +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "llvm/Support/TargetSelect.h" +#include "xla/service/cpu/test_target_triple_helper.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow::tfcompile { + +namespace { + +class EmbeddedConstantBuffersTest : public ::testing::Test { + protected: + EmbeddedConstantBuffersTest() { + // Initialize LLVM's MC layer for the native target.
+ llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + } +}; + +TEST_F(EmbeddedConstantBuffersTest, CreateEmbeddedConstantBuffers) { + std::vector constants_to_embed(1); + + constants_to_embed[0].SerializeIntoBuffer(std::vector({1, 2, 3})); + TF_ASSERT_OK_AND_ASSIGN( + EmbeddedConstantBuffers buffers, + CreateEmbeddedConstantBuffers(kTargetTripleForHost, + absl::MakeSpan(constants_to_embed))); + + EXPECT_EQ(buffers.variable_decls.size(), constants_to_embed.size()); + + for (const auto& variable_decl : buffers.variable_decls) { + EXPECT_EQ(variable_decl.variable_name, "_constant_buffer_contents"); + EXPECT_EQ(variable_decl.variable_decl, + "extern \"C\" char _constant_buffer_contents[];"); + EXPECT_EQ(variable_decl.cpp_access_shim, + "\n [](char* buffer) -> std::pair {\n" + " uint64_t buffer_size;\n" + " std::memcpy(&buffer_size, buffer, sizeof(uint64_t));\n" + " return {buffer_size, buffer + sizeof(uint64_t)};\n" + " }(_constant_buffer_contents)\n "); + } +} + +} // namespace + +} // namespace tensorflow::tfcompile diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD index 6065e5f8492f..a06ab1520b5e 100644 --- a/tensorflow/compiler/aot/tests/BUILD +++ b/tensorflow/compiler/aot/tests/BUILD @@ -401,6 +401,7 @@ tf_cc_test( ":test_graph_tfvariable", ":test_graph_tfvariable_readonly", ":test_graph_tfvariable_sequential_updates", + "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc index 80fa6d4a5075..139647260fbb 100644 --- a/tensorflow/compiler/aot/tests/tfcompile_test.cc +++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc @@ -14,17 +14,6 @@ limitations under the License. ==============================================================================*/ #include -#include -#define EIGEN_USE_THREADS -#define EIGEN_USE_CUSTOM_THREAD_POOL - -#include "absl/strings/str_split.h" -#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive -#include "xla/hlo/testlib/test.h" -#include "xla/service/hlo_profile_printer.h" -#include "xla/shape_util.h" -#include "tensorflow/core/platform/regexp.h" -#include "tensorflow/core/platform/test.h" #include "tensorflow/compiler/aot/tests/test_graph_tfadd.h" #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt.h" @@ -35,32 +24,39 @@ limitations under the License. 
#include "tensorflow/compiler/aot/tests/test_graph_tfgather.h" #include "tensorflow/compiler/aot/tests/test_graph_tfmatmul.h" #include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd.h" -#include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd_with_profiling.h" #include "tensorflow/compiler/aot/tests/test_graph_tfsplits.h" #include "tensorflow/compiler/aot/tests/test_graph_tftop_k.h" #include "tensorflow/compiler/aot/tests/test_graph_tfvariable.h" #include "tensorflow/compiler/aot/tests/test_graph_tfvariable_readonly.h" #include "tensorflow/compiler/aot/tests/test_graph_tfvariable_sequential_updates.h" +#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" +#include "xla/hlo/testlib/test.h" +#include "xla/shape.h" +#include "xla/shape_util.h" +#include "xla/tsl/platform/threadpool.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" + +#define EIGEN_USE_THREADS +#define EIGEN_USE_CUSTOM_THREAD_POOL + +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive namespace tensorflow { namespace tfcompile { namespace { -using ::testing::ContainsRegex; -using ::testing::IsSupersetOf; - TEST(TFCompileTest, Add) { AddComp add; EXPECT_EQ(add.arg0_data(), add.arg_data(0)); EXPECT_EQ(add.arg1_data(), add.arg_data(1)); - add.arg0() = 1; add.arg1() = 2; EXPECT_TRUE(add.Run()); EXPECT_EQ(add.error_msg(), ""); EXPECT_EQ(add.result0(), 3); EXPECT_EQ(add.result0_data()[0], 3); - EXPECT_EQ(add.result0_data(), add.results()[0]); + EXPECT_EQ(add.result0_data(), add.result_data(0)); add.arg0_data()[0] = 123; add.arg1_data()[0] = 456; @@ -68,7 +64,7 @@ TEST(TFCompileTest, Add) { EXPECT_EQ(add.error_msg(), ""); EXPECT_EQ(add.result0(), 579); EXPECT_EQ(add.result0_data()[0], 579); - EXPECT_EQ(add.result0_data(), add.results()[0]); + EXPECT_EQ(add.result0_data(), add.result_data(0)); const AddComp& add_const = add; EXPECT_EQ(add_const.error_msg(), ""); @@ -80,7 +76,7 @@ TEST(TFCompileTest, Add) { EXPECT_EQ(add_const.arg1_data(), add.arg_data(1)); EXPECT_EQ(add_const.result0(), 579); EXPECT_EQ(add_const.result0_data()[0], 579); - EXPECT_EQ(add_const.result0_data(), add_const.results()[0]); + EXPECT_EQ(add_const.result0_data(), add_const.result_data(0)); } // Run tests that use set_argN_data separately, to avoid accidentally re-using @@ -89,8 +85,8 @@ TEST(TFCompileTest, Add_SetArg) { AddComp add( XlaCompiledCpuFunction::AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY); - int32 arg_x = 10; - int32 arg_y = 32; + alignas(32) int32 arg_x = 10; + alignas(32) int32 arg_y = 32; add.set_arg0_data(&arg_x); add.set_arg1_data(&arg_y); EXPECT_EQ(add.arg0_data(), add.arg_data(0)); @@ -100,7 +96,7 @@ TEST(TFCompileTest, Add_SetArg) { EXPECT_EQ(add.error_msg(), ""); EXPECT_EQ(add.result0(), 42); EXPECT_EQ(add.result0_data()[0], 42); - EXPECT_EQ(add.result0_data(), add.results()[0]); + EXPECT_EQ(add.result0_data(), add.result_data(0)); } TEST(TFCompileTest, AddWithCkpt) { @@ -112,14 +108,14 @@ TEST(TFCompileTest, AddWithCkpt) { EXPECT_EQ(add.error_msg(), ""); EXPECT_EQ(add.result0(), 43); EXPECT_EQ(add.result0_data()[0], 43); - EXPECT_EQ(add.result0_data(), add.results()[0]); + EXPECT_EQ(add.result0_data(), add.result_data(0)); add.arg0_data()[0] = 111; EXPECT_TRUE(add.Run()); EXPECT_EQ(add.error_msg(), ""); EXPECT_EQ(add.result0(), 153); EXPECT_EQ(add.result0_data()[0], 153); - EXPECT_EQ(add.result0_data(), add.results()[0]); + EXPECT_EQ(add.result0_data(), add.result_data(0)); const AddWithCkptComp& add_const = add; EXPECT_EQ(add_const.error_msg(), ""); @@ 
-128,7 +124,7 @@ TEST(TFCompileTest, AddWithCkpt) { EXPECT_EQ(add_const.arg0_data(), add_const.arg_data(0)); EXPECT_EQ(add_const.result0(), 153); EXPECT_EQ(add_const.result0_data()[0], 153); - EXPECT_EQ(add_const.result0_data(), add_const.results()[0]); + EXPECT_EQ(add_const.result0_data(), add_const.result_data(0)); } TEST(TFCompileTest, AddWithCkptSaver) { @@ -140,14 +136,14 @@ TEST(TFCompileTest, AddWithCkptSaver) { EXPECT_EQ(add.error_msg(), ""); EXPECT_EQ(add.result0(), 43); EXPECT_EQ(add.result0_data()[0], 43); - EXPECT_EQ(add.result0_data(), add.results()[0]); + EXPECT_EQ(add.result0_data(), add.result_data(0)); add.arg0_data()[0] = 111; EXPECT_TRUE(add.Run()); EXPECT_EQ(add.error_msg(), ""); EXPECT_EQ(add.result0(), 153); EXPECT_EQ(add.result0_data()[0], 153); - EXPECT_EQ(add.result0_data(), add.results()[0]); + EXPECT_EQ(add.result0_data(), add.result_data(0)); const AddWithCkptSaverComp& add_const = add; EXPECT_EQ(add_const.error_msg(), ""); @@ -156,7 +152,7 @@ TEST(TFCompileTest, AddWithCkptSaver) { EXPECT_EQ(add_const.arg0_data(), add_const.arg_data(0)); EXPECT_EQ(add_const.result0(), 153); EXPECT_EQ(add_const.result0_data()[0], 153); - EXPECT_EQ(add_const.result0_data(), add_const.results()[0]); + EXPECT_EQ(add_const.result0_data(), add_const.result_data(0)); } TEST(TFCompileTest, Cond) { @@ -170,17 +166,19 @@ TEST(TFCompileTest, Cond) { cond.arg0() = true; const int32 expected_result = cond.arg1(); EXPECT_TRUE(cond.Run()); + EXPECT_EQ(cond.error_msg(), ""); EXPECT_EQ(cond.result0(), expected_result); EXPECT_EQ(cond.result0_data()[0], expected_result); - EXPECT_EQ(cond.result0_data(), cond.results()[0]); + EXPECT_EQ(cond.result0_data(), cond.result_data(0)); } { cond.arg0() = false; const int32 expected_result = cond.arg2(); EXPECT_TRUE(cond.Run()); + EXPECT_EQ(cond.error_msg(), ""); EXPECT_EQ(cond.result0(), expected_result); EXPECT_EQ(cond.result0_data()[0], expected_result); - EXPECT_EQ(cond.result0_data(), cond.results()[0]); + EXPECT_EQ(cond.result0_data(), cond.result_data(0)); } } @@ -202,7 +200,7 @@ TEST(TFCompileTest, Gather) { EXPECT_EQ(gather.result0(i), results[i]); EXPECT_EQ(gather.result0_data()[i], results[i]); } - EXPECT_EQ(gather.result0_data(), gather.results()[0]); + EXPECT_EQ(gather.result0_data(), gather.result_data(0)); const GatherComp& gather_const = gather; EXPECT_EQ(gather_const.error_msg(), ""); @@ -220,7 +218,7 @@ TEST(TFCompileTest, Gather) { EXPECT_EQ(gather_const.result0(i), results[i]); EXPECT_EQ(gather_const.result0_data()[i], results[i]); } - EXPECT_EQ(gather_const.result0_data(), gather.results()[0]); + EXPECT_EQ(gather_const.result0_data(), gather.result_data(0)); } } @@ -256,7 +254,7 @@ TEST(TFCompileTest, MatMul2) { EXPECT_EQ(matmul.result0(i / 2, i % 2), results[i]); EXPECT_EQ(matmul.result0_data()[i], results[i]); } - EXPECT_EQ(matmul.result0_data(), matmul.results()[0]); + EXPECT_EQ(matmul.result0_data(), matmul.result_data(0)); } // Test using the argN_data() methods. 
@@ -271,7 +269,7 @@ TEST(TFCompileTest, MatMul2) { EXPECT_EQ(matmul.result0(i / 2, i % 2), results[i]); EXPECT_EQ(matmul.result0_data()[i], results[i]); } - EXPECT_EQ(matmul.result0_data(), matmul.results()[0]); + EXPECT_EQ(matmul.result0_data(), matmul.result_data(0)); const foo::bar::MatMulComp& matmul_const = matmul; EXPECT_EQ(matmul_const.error_msg(), ""); @@ -289,7 +287,7 @@ TEST(TFCompileTest, MatMul2) { EXPECT_EQ(matmul_const.result0(i / 2, i % 2), results[i]); EXPECT_EQ(matmul_const.result0_data()[i], results[i]); } - EXPECT_EQ(matmul_const.result0_data(), matmul.results()[0]); + EXPECT_EQ(matmul_const.result0_data(), matmul.result_data(0)); } } @@ -304,8 +302,9 @@ TEST(TFCompileTest, MatMul2_SetArg) { matmul.set_thread_pool(&device); // Test using the set_argN_data() methods. - float arg0[2][3] = {{1, 2, 3}, {4, 5, 6}}; - float arg1[3][2] = {{7, 8}, {9, 10}, {11, 12}}; + + alignas(32) float arg0[2][3] = {{1, 2, 3}, {4, 5, 6}}; + alignas(32) float arg1[3][2] = {{7, 8}, {9, 10}, {11, 12}}; matmul.set_arg0_data(&arg0); matmul.set_arg1_data(&arg1); EXPECT_EQ(matmul.arg0_data(), matmul.arg_data(0)); @@ -318,7 +317,7 @@ TEST(TFCompileTest, MatMul2_SetArg) { EXPECT_EQ(matmul.result0(i / 2, i % 2), results[i]); EXPECT_EQ(matmul.result0_data()[i], results[i]); } - EXPECT_EQ(matmul.result0_data(), matmul.results()[0]); + EXPECT_EQ(matmul.result0_data(), matmul.result_data(0)); } TEST(TFCompileTest, MatMulAndAdd1) { @@ -345,8 +344,8 @@ TEST(TFCompileTest, MatMulAndAdd1) { EXPECT_EQ(muladd.result1(i / 2, i % 2), results1[i]); EXPECT_EQ(muladd.result1_data()[i], results1[i]); } - EXPECT_EQ(muladd.result0_data(), muladd.results()[0]); - EXPECT_EQ(muladd.result1_data(), muladd.results()[1]); + EXPECT_EQ(muladd.result0_data(), muladd.result_data(0)); + EXPECT_EQ(muladd.result1_data(), muladd.result_data(1)); const ::foo::bar::MatMulAndAddComp& muladd_const = muladd; EXPECT_EQ(muladd_const.error_msg(), ""); @@ -366,8 +365,8 @@ TEST(TFCompileTest, MatMulAndAdd1) { EXPECT_EQ(muladd_const.result1(i / 2, i % 2), results1[i]); EXPECT_EQ(muladd_const.result1_data()[i], results1[i]); } - EXPECT_EQ(muladd_const.result0_data(), muladd.results()[0]); - EXPECT_EQ(muladd_const.result1_data(), muladd.results()[1]); + EXPECT_EQ(muladd_const.result0_data(), muladd.result_data(0)); + EXPECT_EQ(muladd_const.result1_data(), muladd.result_data(1)); } // Test methods with named args and results. @@ -385,8 +384,8 @@ TEST(TFCompileTest, MatMulAndAdd1) { EXPECT_EQ(muladd.result_x_y_sum(i / 2, i % 2), results1[i]); EXPECT_EQ(muladd.result_x_y_sum_data()[i], results1[i]); } - EXPECT_EQ(muladd.result_x_y_prod_data(), muladd.results()[0]); - EXPECT_EQ(muladd.result_x_y_sum_data(), muladd.results()[1]); + EXPECT_EQ(muladd.result_x_y_prod_data(), muladd.result_data(0)); + EXPECT_EQ(muladd.result_x_y_sum_data(), muladd.result_data(1)); // Test const methods. 
const ::foo::bar::MatMulAndAddComp& muladd_const = muladd; @@ -407,8 +406,8 @@ TEST(TFCompileTest, MatMulAndAdd1) { EXPECT_EQ(muladd_const.result_x_y_sum(i / 2, i % 2), results1[i]); EXPECT_EQ(muladd_const.result_x_y_sum_data()[i], results1[i]); } - EXPECT_EQ(muladd_const.result_x_y_prod_data(), muladd.results()[0]); - EXPECT_EQ(muladd_const.result_x_y_sum_data(), muladd.results()[1]); + EXPECT_EQ(muladd_const.result_x_y_prod_data(), muladd.result_data(0)); + EXPECT_EQ(muladd_const.result_x_y_sum_data(), muladd.result_data(1)); } } @@ -424,7 +423,7 @@ TEST(TFCompileTest, Function) { EXPECT_EQ(add_fn.error_msg(), ""); EXPECT_EQ(add_fn.result0(), 3); EXPECT_EQ(add_fn.result0_data()[0], 3); - EXPECT_EQ(add_fn.result0_data(), add_fn.results()[0]); + EXPECT_EQ(add_fn.result0_data(), add_fn.result_data(0)); } TEST(TFCompileTest, Splits) { @@ -484,11 +483,14 @@ TEST(TFCompileTest, VariableReadonly) { Eigen::ThreadPoolDevice device(&tp, tp.NumThreads()); VariableReadonlyComp fn; - float x = 23; + + alignas(32) float x = 23; fn.set_var_x_data(&x); fn.set_thread_pool(&device); - fn.Run(); + EXPECT_TRUE(fn.Run()); + EXPECT_EQ(fn.error_msg(), ""); + EXPECT_EQ(fn.result0(), 65); EXPECT_EQ(fn.var_x(), 23); } @@ -498,18 +500,21 @@ TEST(TFCompileTest, Variable) { Eigen::ThreadPoolDevice device(&tp, tp.NumThreads()); VariableComp fn; - float x = 23; + + alignas(32) float x = 23; fn.set_var_x_data(&x); fn.set_thread_pool(&device); - fn.Run(); + EXPECT_TRUE(fn.Run()); + EXPECT_EQ(fn.error_msg(), ""); EXPECT_EQ(fn.result0(0, 0), 23); EXPECT_EQ(fn.result0(1, 0), 65); EXPECT_EQ(fn.var_x(), 65); EXPECT_EQ(fn.var_x_data(), &x); EXPECT_EQ(x, 65); - fn.Run(); + EXPECT_TRUE(fn.Run()); + EXPECT_EQ(fn.error_msg(), ""); EXPECT_EQ(fn.result0(0, 0), 65); EXPECT_EQ(fn.result0(1, 0), 107); EXPECT_EQ(fn.var_x(), 107); @@ -528,17 +533,19 @@ TEST(TFCompileTest, VariableSequentialUpdates) { fn.set_thread_pool(&device); // First calculate x[3] - fn.Run(); + EXPECT_TRUE(fn.Run()); + EXPECT_EQ(fn.error_msg(), ""); EXPECT_NEAR(fn.var_x(), 1.187f, 1e-6); - const float y = 1; + alignas(32) const float y = 1; fn.set_var_y_data(&y); - // Now const_cast(fn.var_y_data()) is not longer legal since we've set - // the buffer to point to a constant location. + // Now const_cast(fn.var_y_data()) is not longer legal since we've + // set the buffer to point to a constant location. // Then calculate x[6] - fn.Run(); + EXPECT_TRUE(fn.Run()); + EXPECT_EQ(fn.error_msg(), ""); EXPECT_NEAR(fn.var_x(), 0.594322f, 1e-6); } @@ -551,24 +558,27 @@ TEST(TFCompileTest, VariableSequentialUpdatesNoAlloc) { // x[n+1] = x[n] - 0.1*(x[n-1] + 1.0) VariableSequentialUpdatesComp fn( XlaCompiledCpuFunction::AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY); - float x = 2; - float y = 1; + + alignas(32) float x = 2; + alignas(32) float y = 1; fn.set_var_x_data(&x); fn.set_var_y_data(&y); fn.set_thread_pool(&device); // First calculate x[3] - fn.Run(); + EXPECT_TRUE(fn.Run()); + EXPECT_EQ(fn.error_msg(), ""); EXPECT_NEAR(x, 1.187f, 1e-6); // Then calculate x[6] - fn.Run(); + EXPECT_TRUE(fn.Run()); + EXPECT_EQ(fn.error_msg(), ""); EXPECT_NEAR(x, 0.594322f, 1e-6); } TEST(TFCompileTest, AssertEqAndReturnDiff) { - // Assert is converted into a no-op in XLA, so there is no failure even if the - // two args are different. + // Assert is converted into a no-op in XLA, so there is no failure even if + // the two args are different. 
AssertComp assert; EXPECT_EQ(assert.arg0_data(), assert.arg_data(0)); EXPECT_EQ(assert.arg1_data(), assert.arg_data(1)); @@ -580,7 +590,7 @@ TEST(TFCompileTest, AssertEqAndReturnDiff) { EXPECT_EQ(assert.error_msg(), ""); EXPECT_EQ(assert.result0(), expected_result); EXPECT_EQ(assert.result0_data()[0], expected_result); - EXPECT_EQ(assert.result0_data(), assert.results()[0]); + EXPECT_EQ(assert.result0_data(), assert.result_data(0)); } TEST(TFCompileTest, LookupNameIndex) { @@ -638,61 +648,6 @@ TEST(TFCompileTest, ProgramShape) { EXPECT_TRUE(ShapeUtil::Compatible(muladd_result1, f32_2x2)); } -TEST(TFCompileTest, HloProfiling) { - Eigen::ThreadPool tp(1); - Eigen::ThreadPoolDevice device(&tp, tp.NumThreads()); - - MatMulAndAddCompWithProfiling fn; - ASSERT_TRUE(fn.hlo_profiling_enabled()); - - fn.set_thread_pool(&device); - - // x = [[1, 2], [3, 4]] - fn.arg0(0, 0) = 1; - fn.arg0(0, 1) = 2; - fn.arg0(1, 0) = 3; - fn.arg0(1, 1) = 4; - - // y = [[10, 20], [30, 40]] - fn.arg1(0, 0) = 10; - fn.arg1(0, 1) = 20; - fn.arg1(1, 0) = 30; - fn.arg1(1, 1) = 40; - - EXPECT_TRUE(fn.Run()); - - string hlo_profile_as_string = - xla::PrintHloProfile(fn.hlo_profile_printer_data(), fn.profile_counters(), - /*clock_rate_ghz=*/1.0); - VLOG(1) << "Original HLO profile string:\n" << hlo_profile_as_string; - - // Strip away identifier details from the profile string to avoid this test - // being a change detector for xla internals. Identifiers such as '%dot.0.7' - // just become '%dot'. - RE2::GlobalReplace(&hlo_profile_as_string, "(%[a-zA-Z0-9]*)[.0-9]*", "\\1"); - VLOG(1) << "Stripped HLO profile string:\n" << hlo_profile_as_string; - - std::vector hlo_profile_lines = - absl::StrSplit(hlo_profile_as_string, '\n'); - - auto header = ContainsRegex("Execution profile for"); - auto total_cycles_profile_line = ContainsRegex(R"(\[total\])"); - auto dot_profile_line = - ContainsRegex(R"(%dot = f32\[2,2\]{1,0\} dot\(.*%arg0, .*%arg1\))"); - auto add_profile_line = - ContainsRegex(R"(%add = f32\[2,2\]\{1,0\} add\(.*%arg0, .*%arg1\))"); - auto tuple_profile_line = ContainsRegex( - R"(%tuple = \(f32\[2,2\]\{1,0\}, f32\[2,2\]\{1,0\}\) tuple\(.*%dot, .*%add\))"); - auto arg0_profile_line = - ContainsRegex(R"(%arg0 = f32\[2,2\]\{1,0\} parameter\(0\))"); - auto arg1_profile_line = - ContainsRegex(R"(%arg1 = f32\[2,2\]\{1,0\} parameter\(1\))"); - - EXPECT_THAT(hlo_profile_lines, - IsSupersetOf({header, total_cycles_profile_line, dot_profile_line, - add_profile_line, tuple_profile_line})); -} - } // namespace } // namespace tfcompile } // namespace tensorflow diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index c8719714c79f..6a1a0d55511d 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -480,9 +480,6 @@ def tf_library( gen_benchmark=True. The output header is called .h. - Deprecated: - tfcompile is deprecated (b/389018081). As an alternative, consider using - XLA:CPU's AOT capabilities directly. Args: name: The name of the build rule. graph: The TensorFlow GraphDef to compile. If the file ends in '.pbtxt' diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc index 458d6f708974..ea175238e909 100644 --- a/tensorflow/compiler/aot/tfcompile_main.cc +++ b/tensorflow/compiler/aot/tfcompile_main.cc @@ -13,11 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include -#include -#include +#include #include +#include "absl/log/check.h" #include "absl/status/status.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/aot/compile.h" diff --git a/tensorflow/compiler/aot/thunk_proto_execution_deserializer.cc b/tensorflow/compiler/aot/thunk_proto_execution_deserializer.cc new file mode 100644 index 000000000000..6a775e1f0d74 --- /dev/null +++ b/tensorflow/compiler/aot/thunk_proto_execution_deserializer.cc @@ -0,0 +1,696 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/aot/thunk_proto_execution_deserializer.h" + +#include +#include +#include +#include + +#include "absl/numeric/int128.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_replace.h" +#include "absl/strings/string_view.h" +#include "xla/backends/cpu/runtime/convolution_lib.h" +#include "xla/backends/cpu/runtime/dot_lib.h" +#include "xla/backends/cpu/runtime/thunk.pb.h" +#include "xla/service/cpu/executable.pb.h" +#include "xla/shape.h" +#include "xla/shape_util.h" +#include "xla/util.h" +#include "xla/xla_data.pb.h" + +namespace tensorflow { +namespace tfcompile { + +namespace { + +std::string GetBufferAllocationString( + const xla::buffer_assignment::BufferAllocationSliceProto& slice) { + return absl::StrCat("reinterpret_cast(buffer_table()[", + slice.buffer_allocation_index(), "]) + ", slice.offset()); +} + +} // namespace + +absl::StatusOr +ThunkProtoExecutionDeserializer::GetThunkSpecificRunImpl( + const xla::cpu::CompilationResultProto& proto) && { + return ThunkSpecificRunImplFromThunkSequence(proto.thunk_sequence()); +} + +absl::StatusOr +ThunkProtoExecutionDeserializer::ThunkSpecificRunImplFromThunkSequence( + const xla::cpu::ThunkSequenceProto& thunk_sequence_proto) { + std::vector thunk_run_impls; + thunk_run_impls.reserve(thunk_sequence_proto.thunks_size()); + + for (const auto& thunk : thunk_sequence_proto.thunks()) { + switch (thunk.impl_case()) { + case xla::cpu::ThunkProto::kKernelThunk: { + TF_ASSIGN_OR_RETURN(thunk_run_impls.emplace_back(), + GetKernelThunkRunImpl(thunk)); + break; + } + case xla::cpu::ThunkProto::kDotThunk: { + TF_ASSIGN_OR_RETURN(thunk_run_impls.emplace_back(), + GetDotThunkRunImpl(thunk)); + break; + } + case xla::cpu::ThunkProto::kCopyThunk: { + TF_ASSIGN_OR_RETURN(thunk_run_impls.emplace_back(), + GetCopyThunkRunImpl(thunk)); + break; + } + case xla::cpu::ThunkProto::kConditionalThunk: { + TF_ASSIGN_OR_RETURN(thunk_run_impls.emplace_back(), + GetConditionalThunkRunImpl(thunk)); + break; + } + case xla::cpu::ThunkProto::kWhileThunk: { + TF_ASSIGN_OR_RETURN(thunk_run_impls.emplace_back(), + GetWhileThunkRunImpl(thunk)); + break; + } + case xla::cpu::ThunkProto::kConvolutionThunk: { + 
TF_ASSIGN_OR_RETURN(thunk_run_impls.emplace_back(), + GetConvolutionFusionThunkRunImpl(thunk)); + break; + } + case xla::cpu::ThunkProto::kRngGetAndUpdateStateThunk: { + TF_ASSIGN_OR_RETURN(thunk_run_impls.emplace_back(), + GetRngGetAndUpdateStateThunkRunImpl(thunk)); + break; + } + case xla::cpu::ThunkProto::kCallThunk: { + TF_ASSIGN_OR_RETURN(thunk_run_impls.emplace_back(), + GetCallThunkRunImpl(thunk)); + break; + } + default: { + return xla::Internal("Unsupported thunk type: %s.", thunk.kind()); + } + } + } + + return absl::StrJoin(thunk_run_impls, "\n"); +} + +absl::StatusOr ThunkProtoExecutionDeserializer::GetMatmulFunction( + xla::PrimitiveType xla_type, bool is_single_threaded) { + switch (xla_type) { + case xla::F16: + return is_single_threaded + ? "__xla_cpu_runtime_EigenSingleThreadedMatMulF16" + : "__xla_cpu_runtime_EigenMatMulF16"; + case xla::F32: + return is_single_threaded + ? "__xla_cpu_runtime_EigenSingleThreadedMatMulF32" + : "__xla_cpu_runtime_EigenMatMulF32"; + case xla::F64: + return is_single_threaded + ? "__xla_cpu_runtime_EigenSingleThreadedMatMulF64" + : "__xla_cpu_runtime_EigenMatMulF64"; + case xla::C64: + return is_single_threaded + ? "__xla_cpu_runtime_EigenSingleThreadedMatMulC64" + : "__xla_cpu_runtime_EigenMatMulC64"; + case xla::C128: + return is_single_threaded + ? "__xla_cpu_runtime_EigenSingleThreadedMatMulC128" + : "__xla_cpu_runtime_EigenMatMulC128"; + default: + return xla::Internal("Unsupported xla type: %d", xla_type); + } +} + +absl::StatusOr ThunkProtoExecutionDeserializer::GetDotThunkRunImpl( + const xla::cpu::ThunkProto& thunk) { + if (!thunk.has_dot_thunk()) { + return xla::Internal( + "Dot thunk was expected when getting thunk run implementation."); + } + const xla::cpu::DotThunkProto& dot_thunk = thunk.dot_thunk(); + + absl::string_view dot_thunk_invocation_format = R"( + // Dot Thunk + { + if (run_options()->intra_op_thread_pool() != nullptr) { + {{MATMUL_FUNCTION}}( + run_options(), {{OUTPUT_PTR}}, {{LHS_PTR}}, {{RHS_PTR}}, + {{M}}, {{N}}, {{K}}, {{TRANSPOSE_LHS}}, {{TRANSPOSE_RHS}}); + } else { + {{SINGLE_THREADED_MATMUL_FUNCTION}}( + nullptr, {{OUTPUT_PTR}}, {{LHS_PTR}}, {{RHS_PTR}}, + {{M}}, {{N}}, {{K}}, {{TRANSPOSE_LHS}}, {{TRANSPOSE_RHS}}); + } + } + )"; + + if (!(dot_thunk.lhs_buffer_shape().shape().element_type() == + dot_thunk.rhs_buffer_shape().shape().element_type() && + dot_thunk.rhs_buffer_shape().shape().element_type() == + dot_thunk.out_buffer_shape().shape().element_type())) { + return xla::Internal( + "Dot thunk has mismatched types between lhs, rhs, and out buffers."); + } + + TF_ASSIGN_OR_RETURN( + std::string matmul_function, + GetMatmulFunction(dot_thunk.lhs_buffer_shape().shape().element_type(), + /*is_single_threaded=*/false)); + + TF_ASSIGN_OR_RETURN( + std::string single_threaded_matmul_function, + GetMatmulFunction(dot_thunk.lhs_buffer_shape().shape().element_type(), + /*is_single_threaded=*/true)); + + TF_ASSIGN_OR_RETURN(std::string data_type, + CppDataTypeFromXlaType( + dot_thunk.lhs_buffer_shape().shape().element_type())); + + std::string output_ptr = absl::StrCat( + "reinterpret_cast<", data_type, "*>(", + GetBufferAllocationString(dot_thunk.out_buffer_shape().slice()), ")"); + std::string lhs_ptr = absl::StrCat( + "reinterpret_cast<", data_type, "*>(", + GetBufferAllocationString(dot_thunk.lhs_buffer_shape().slice()), ")"); + std::string rhs_ptr = absl::StrCat( + "reinterpret_cast<", data_type, "*>(", + GetBufferAllocationString(dot_thunk.rhs_buffer_shape().slice()), ")"); + + auto lhs_shape = 
xla::Shape(dot_thunk.lhs_buffer_shape().shape()); + auto rhs_shape = xla::Shape(dot_thunk.rhs_buffer_shape().shape()); + auto out_shape = xla::Shape(dot_thunk.out_buffer_shape().shape()); + + TF_ASSIGN_OR_RETURN(xla::cpu::DotShape dot_shape, + xla::cpu::GetDotShape(dot_thunk.dot_dimensions(), + lhs_shape, rhs_shape, out_shape)); + + TF_ASSIGN_OR_RETURN( + xla::cpu::DotCanonicalDims dot_canonical_dims, + GetDotCanonicalDims(dot_thunk.dot_dimensions(), dot_shape)); + + size_t m = dot_canonical_dims.m; + size_t k = dot_canonical_dims.k; + size_t n = dot_canonical_dims.n; + + // Decide if a transpose is required based on an XOR of the canonical and + // column major flags. + bool transpose_lhs = + (dot_canonical_dims.lhs_canonical != dot_canonical_dims.lhs_column_major); + bool transpose_rhs = + (dot_canonical_dims.rhs_canonical != dot_canonical_dims.rhs_column_major); + + if (!dot_canonical_dims.output_column_major) { + std::swap(m, n); + std::swap(lhs_ptr, rhs_ptr); + std::swap(transpose_lhs, transpose_rhs); + transpose_lhs = !transpose_lhs; + transpose_rhs = !transpose_rhs; + } + + return absl::StrReplaceAll( + dot_thunk_invocation_format, + {{"{{MATMUL_FUNCTION}}", matmul_function}, + {"{{SINGLE_THREADED_MATMUL_FUNCTION}}", single_threaded_matmul_function}, + {"{{OUTPUT_PTR}}", output_ptr}, + {"{{LHS_PTR}}", lhs_ptr}, + {"{{RHS_PTR}}", rhs_ptr}, + {"{{M}}", absl::StrCat(m)}, + {"{{N}}", absl::StrCat(n)}, + {"{{K}}", absl::StrCat(k)}, + {"{{TRANSPOSE_LHS}}", transpose_lhs ? "true" : "false"}, + {"{{TRANSPOSE_RHS}}", transpose_rhs ? "true" : "false"}}); +}; + +absl::StatusOr +ThunkProtoExecutionDeserializer::GetConvolutionFunction( + xla::PrimitiveType xla_type, bool is_single_threaded) { + switch (xla_type) { + case xla::F16: + return is_single_threaded + ? "__xla_cpu_runtime_EigenSingleThreadedConv2DF16" + : "__xla_cpu_runtime_EigenConv2DF16"; + case xla::F32: + return is_single_threaded + ? 
"__xla_cpu_runtime_EigenSingleThreadedConv2DF32" + : "__xla_cpu_runtime_EigenConv2DF32"; + default: + return xla::Internal("Unsupported xla type: %d", xla_type); + } +} + +absl::StatusOr +ThunkProtoExecutionDeserializer::GetConvolution2DRunImpl( + const xla::cpu::ConvolutionThunkProto& convolution_thunk, + const xla::cpu::ConvolutionCanonicalDims& canonical_dims) { + TF_ASSIGN_OR_RETURN( + std::string data_type, + CppDataTypeFromXlaType( + convolution_thunk.input_buffer_shape().shape().element_type())); + + std::string output_ptr = + absl::StrCat("reinterpret_cast<", data_type, "*>(", + GetBufferAllocationString( + convolution_thunk.output_buffer_shape().slice()), + ")"); + std::string lhs_ptr = absl::StrCat( + "reinterpret_cast<", data_type, "*>(", + GetBufferAllocationString(convolution_thunk.input_buffer_shape().slice()), + ")"); + std::string rhs_ptr = + absl::StrCat("reinterpret_cast<", data_type, "*>(", + GetBufferAllocationString( + convolution_thunk.kernel_buffer_shape().slice()), + ")"); + + TF_ASSIGN_OR_RETURN( + std::string convolution_function, + GetConvolutionFunction( + convolution_thunk.input_buffer_shape().shape().element_type(), + /*is_single_threaded=*/false)); + + TF_ASSIGN_OR_RETURN( + std::string single_threaded_convolution_function, + GetConvolutionFunction( + convolution_thunk.input_buffer_shape().shape().element_type(), + /*is_single_threaded=*/true)); + + absl::string_view convolution_thunk_invocation_format = R"( + // Convolution Thunk + { + if (run_options()->intra_op_thread_pool() != nullptr) { + {{CONVOLUTION_FUNCTION}}( + run_options(), + {{OUTPUT_PTR}}, {{LHS_PTR}}, {{RHS_PTR}}, {{INPUT_BATCH}}, + {{INPUT_ROWS}}, {{INPUT_COLS}}, {{INPUT_CHANNELS}}, {{KERNEL_ROWS}}, + {{KERNEL_COLS}}, {{KERNEL_CHANNELS}}, {{KERNEL_FILTERS}}, + {{OUTPUT_ROWS}}, {{OUTPUT_COLS}}, {{ROW_STRIDE}}, {{COL_STRIDE}}, + {{PADDING_TOP}}, {{PADDING_BOTTOM}}, {{PADDING_LEFT}}, + {{PADDING_RIGHT}}, {{LHS_ROW_DILATION}}, {{LHS_COL_DILATION}}, + {{RHS_ROW_DILATION}}, {{RHS_COL_DILATION}}, {{FEATURE_GROUP_COUNT}} + ); + } else { + {{SINGLE_THREADED_CONVOLUTION_FUNCTION}}( + nullptr, + {{OUTPUT_PTR}}, {{LHS_PTR}}, {{RHS_PTR}}, {{INPUT_BATCH}}, + {{INPUT_ROWS}}, {{INPUT_COLS}}, {{INPUT_CHANNELS}}, {{KERNEL_ROWS}}, + {{KERNEL_COLS}}, {{KERNEL_CHANNELS}}, {{KERNEL_FILTERS}}, + {{OUTPUT_ROWS}}, {{OUTPUT_COLS}}, {{ROW_STRIDE}}, {{COL_STRIDE}}, + {{PADDING_TOP}}, {{PADDING_BOTTOM}}, {{PADDING_LEFT}}, + {{PADDING_RIGHT}}, {{LHS_ROW_DILATION}}, {{LHS_COL_DILATION}}, + {{RHS_ROW_DILATION}}, {{RHS_COL_DILATION}}, {{FEATURE_GROUP_COUNT}} + ); + } + })"; + + return absl::StrReplaceAll( + convolution_thunk_invocation_format, + {{"{{CONVOLUTION_FUNCTION}}", convolution_function}, + {"{{SINGLE_THREADED_CONVOLUTION_FUNCTION}}", + single_threaded_convolution_function}, + {"{{OUTPUT_PTR}}", output_ptr}, + {"{{LHS_PTR}}", lhs_ptr}, + {"{{RHS_PTR}}", rhs_ptr}, + {"{{INPUT_BATCH}}", absl::StrCat(canonical_dims.input_batch)}, + {"{{INPUT_ROWS}}", absl::StrCat(canonical_dims.input_dims.x)}, + {"{{INPUT_COLS}}", absl::StrCat(canonical_dims.input_dims.y)}, + {"{{INPUT_CHANNELS}}", absl::StrCat(canonical_dims.input_channels)}, + {"{{KERNEL_ROWS}}", absl::StrCat(canonical_dims.kernel_dims.x)}, + {"{{KERNEL_COLS}}", absl::StrCat(canonical_dims.kernel_dims.y)}, + {"{{KERNEL_CHANNELS}}", absl::StrCat(canonical_dims.kernel_channels)}, + {"{{KERNEL_FILTERS}}", absl::StrCat(canonical_dims.kernel_filters)}, + {"{{OUTPUT_ROWS}}", absl::StrCat(canonical_dims.output_dims.x)}, + {"{{OUTPUT_COLS}}", 
absl::StrCat(canonical_dims.output_dims.y)}, + {"{{ROW_STRIDE}}", absl::StrCat(canonical_dims.strides.x)}, + {"{{COL_STRIDE}}", absl::StrCat(canonical_dims.strides.y)}, + {"{{PADDING_TOP}}", absl::StrCat(canonical_dims.padding_before.x)}, + {"{{PADDING_BOTTOM}}", absl::StrCat(canonical_dims.padding_after.x)}, + {"{{PADDING_LEFT}}", absl::StrCat(canonical_dims.padding_before.y)}, + {"{{PADDING_RIGHT}}", absl::StrCat(canonical_dims.padding_after.y)}, + {"{{LHS_ROW_DILATION}}", absl::StrCat(canonical_dims.base_dilation.x)}, + {"{{LHS_COL_DILATION}}", absl::StrCat(canonical_dims.base_dilation.y)}, + {"{{RHS_ROW_DILATION}}", absl::StrCat(canonical_dims.window_dilation.x)}, + {"{{RHS_COL_DILATION}}", absl::StrCat(canonical_dims.window_dilation.y)}, + {"{{FEATURE_GROUP_COUNT}}", + absl::StrCat(canonical_dims.feature_group_count)}}); +} + +absl::StatusOr +ThunkProtoExecutionDeserializer::GetConvolutionFusionThunkRunImpl( + const xla::cpu::ThunkProto& thunk) { + if (!thunk.has_convolution_thunk()) { + return xla::Internal( + "Convolution thunk was expected when getting thunk run " + "implementation."); + } + const xla::cpu::ConvolutionThunkProto& convolution_thunk = + thunk.convolution_thunk(); + + // NOTE(basioli): Slices are not needed here, we only use this class to + // invoke GetConvolutionCanonicalDims. + xla::cpu::ConvolutionSlices slices{ + /*input_buffer =*/{}, + /*input_shape =*/ + xla::Shape(convolution_thunk.input_buffer_shape().shape()), + /*kernel_buffer =*/{}, + /*kernel_shape =*/ + xla::Shape(convolution_thunk.kernel_buffer_shape().shape()), + /*output_buffer =*/{}, + /*output_shape =*/ + xla::Shape(convolution_thunk.output_buffer_shape().shape()), + }; + + TF_ASSIGN_OR_RETURN( + xla::cpu::ConvolutionCanonicalDims canonical_dims, + xla::cpu::GetConvolutionCanonicalDims( + slices, convolution_thunk.dimension_numbers(), + convolution_thunk.window(), convolution_thunk.feature_group_count())); + + if (canonical_dims.convolution_rank() == 2) { + return GetConvolution2DRunImpl(convolution_thunk, canonical_dims); + } else { + return xla::Internal("3D convolution is not implemented."); + } +} + +absl::StatusOr +ThunkProtoExecutionDeserializer::GetRngGetAndUpdateStateThunkRunImpl( + const xla::cpu::ThunkProto& thunk) { + if (!thunk.has_rng_get_and_update_state_thunk()) { + return xla::Internal( + "RngGetAndUpdateState thunk was expected when getting thunk run " + "implementation."); + } + const xla::cpu::RngGetAndUpdateStateThunkProto& rng_thunk = + thunk.rng_get_and_update_state_thunk(); + absl::string_view rng_thunk_invocation_format = R"( + // Rng Thunk + { + rng_states_[{{RNG_STATE_INDEX}}].GetAndUpdateState({{RNG_STATE_PTR}}); + })"; + + if (rng_thunk.state_buffer().size() != sizeof(absl::int128)) { + return absl::InvalidArgumentError( + absl::StrCat("Rng state buffer size: ", rng_thunk.state_buffer().size(), + " is not equal to the size of an absl::int128: ", + sizeof(absl::int128))); + } + + return absl::StrReplaceAll( + rng_thunk_invocation_format, + {{"{{RNG_STATE_INDEX}}", absl::StrCat(rng_state_index_++)}, + {"{{RNG_STATE_PTR}}", + absl::StrCat("reinterpret_cast(", + GetBufferAllocationString(rng_thunk.state_buffer()), + ")")}}); +} + +absl::StatusOr +ThunkProtoExecutionDeserializer::GetCallThunkRunImpl( + const xla::cpu::ThunkProto& thunk) { + if (!thunk.has_call_thunk()) { + return xla::Internal( + "Calls thunk was expected when getting thunk run implementation."); + } + const xla::cpu::CallThunkProto& call_thunk = thunk.call_thunk(); + absl::string_view 
call_thunk_invocation_format = R"( + // Call Thunk + { + {{CALL_THUNK_IMPL}} + })"; + + TF_ASSIGN_OR_RETURN( + std::string call_thunk_impl, + ThunkSpecificRunImplFromThunkSequence(call_thunk.called_sequence())); + + return absl::StrReplaceAll(call_thunk_invocation_format, + {{"{{CALL_THUNK_IMPL}}", call_thunk_impl}}); +} + +absl::StatusOr +ThunkProtoExecutionDeserializer::GetKernelThunkRunImpl( + const xla::cpu::ThunkProto& thunk) { + if (!thunk.has_kernel_thunk()) { + return xla::Internal( + "Kernel thunk was expected when getting thunk run implementation."); + } + const xla::cpu::KernelThunkProto& kernel_thunk = thunk.kernel_thunk(); + + auto get_args_initializer_as_string = + [](const xla::cpu::KernelThunkProto& kernel_thunk) -> std::string { + std::vector args_initializer; + for (const auto& buffer_proto : kernel_thunk.arguments_buffers()) { + args_initializer.push_back(absl::StrCat( + "XLA_CPU_KernelArg{", GetBufferAllocationString(buffer_proto), ", ", + buffer_proto.size(), "}")); + } + for (const auto& buffer_proto : kernel_thunk.results_buffers()) { + args_initializer.push_back(absl::StrCat( + "XLA_CPU_KernelArg{", GetBufferAllocationString(buffer_proto), ", ", + buffer_proto.size(), "}")); + } + return absl::StrCat("{", absl::StrJoin(args_initializer, ", "), "}"); + }; + + // Execute in block so we don't have to worry about naming for now + absl::string_view kernel_invocation_format = R"( + // Kernel Thunk + { + std::array args = {{ARGS_INITIALIZER}}; + XLA_CPU_KernelThreadDim kernel_thread_dims = { + {{THREAD_DIM_X}}, + {{THREAD_DIM_Y}}, + {{THREAD_DIM_Z}}, + }; + + for (uint64_t z = 0; z < {{THREAD_DIM_Z}}; ++z) { + for (uint64_t y = 0; y < {{THREAD_DIM_Y}}; ++y) { + for (uint64_t x = 0; x < {{THREAD_DIM_X}}; ++x) { + XLA_CPU_KernelThread kernel_thread = {x, y, z}; + + XLA_CPU_KernelCallFrame call_frame = { + &kernel_thread_dims, &kernel_thread, args.size(), args.data()}; + + XLA_CPU_KernelError* error = (*{{KERNEL_NAME}})(&call_frame); + + if (ABSL_PREDICT_FALSE(error != nullptr)) { + return false; + } + } + } + } + } + )"; + + return absl::StrReplaceAll( + kernel_invocation_format, + { + {"{{NUM_ARGS}}", + absl::StrCat(kernel_thunk.arguments_buffers().size() + + kernel_thunk.results_buffers().size())}, + {"{{ARGS_INITIALIZER}}", + get_args_initializer_as_string(kernel_thunk)}, + {"{{THREAD_DIM_X}}", absl::StrCat(kernel_thunk.thread_dim().x())}, + {"{{THREAD_DIM_Y}}", absl::StrCat(kernel_thunk.thread_dim().y())}, + {"{{THREAD_DIM_Z}}", absl::StrCat(kernel_thunk.thread_dim().z())}, + {"{{KERNEL_NAME}}", kernel_thunk.kernel_name()}, + }); +} + +absl::StatusOr +ThunkProtoExecutionDeserializer::GetCopyThunkRunImpl( + const xla::cpu::ThunkProto& thunk) { + if (!thunk.has_copy_thunk()) { + return xla::Internal( + "Copy thunk was expected when getting thunk run implementation."); + } + const xla::cpu::CopyThunkProto& copy_thunk = thunk.copy_thunk(); + + if (!xla::ShapeUtil::Equal( + xla::Shape(copy_thunk.src_buffer_shape().shape()), + xla::Shape(copy_thunk.dst_buffer_shape().shape()))) { + return xla::Internal("Source and destination shapes must be equal."); + } + + absl::string_view copy_invocation_format = R"( + // Copy Thunk + { + std::memcpy({{DST_BUFFER}}, + {{SRC_BUFFER}}, + {{SRC_BUFFER_SIZE}}); + } + )"; + + return absl::StrReplaceAll( + copy_invocation_format, + { + {"{{DST_BUFFER}}", + GetBufferAllocationString(copy_thunk.dst_buffer_shape().slice())}, + {"{{SRC_BUFFER}}", + GetBufferAllocationString(copy_thunk.src_buffer_shape().slice())}, + {"{{SRC_BUFFER_SIZE}}", + 
absl::StrCat(copy_thunk.src_buffer_shape().slice().size())}, + }); +} + +absl::StatusOr +ThunkProtoExecutionDeserializer::GetConditionalThunkRunImpl( + const xla::cpu::ThunkProto& thunk) { + if (!thunk.has_conditional_thunk()) { + return xla::Internal( + "Conditional thunk was expected when getting thunk run " + "implementation."); + } + const xla::cpu::ConditionalThunkProto& conditional_thunk = + thunk.conditional_thunk(); + + std::vector conditional_thunk_branches; + conditional_thunk_branches.reserve(conditional_thunk.branch_sequences_size()); + for (const auto& branch_sequence : conditional_thunk.branch_sequences()) { + TF_ASSIGN_OR_RETURN(conditional_thunk_branches.emplace_back(), + ThunkSpecificRunImplFromThunkSequence(branch_sequence)); + } + + absl::string_view branch_execution_format = R"( + case {{CASE_INDEX}}: { + {{BRANCH_EXECUTION}} + break; + } + )"; + + std::vector branch_execution_impls; + branch_execution_impls.reserve(conditional_thunk_branches.size()); + + for (size_t i = 0; i < conditional_thunk_branches.size(); ++i) { + branch_execution_impls.push_back(absl::StrReplaceAll( + branch_execution_format, + { + {"{{CASE_INDEX}}", absl::StrCat(i)}, + {"{{BRANCH_EXECUTION}}", conditional_thunk_branches[i]}, + })); + } + + absl::string_view conditional_thunk_invocation_format = R"( + // Conditional Thunk + { + size_t branch_index = {{BRANCH_INDEX}}; + CHECK(branch_index < {{NUM_BRANCHES}}) << "branch_index is out of bounds"; + switch (branch_index) { + {{BRANCH_EXECUTIONS}} + } + })"; + + auto get_branch_index = + [](const xla::buffer_assignment::BufferAllocationSliceProto& + branch_index_buffer) -> absl::StatusOr { + if (branch_index_buffer.size() == sizeof(bool)) { + return absl::StrCat("*reinterpret_cast(", + GetBufferAllocationString(branch_index_buffer), + ") ? 
0 : 1"); + } + if (branch_index_buffer.size() == sizeof(int32_t)) { + return absl::StrCat("*reinterpret_cast(", + GetBufferAllocationString(branch_index_buffer), ")"); + } + + return xla::Internal("Unsupported branch index buffer size %d", + branch_index_buffer.size()); + }; + + TF_ASSIGN_OR_RETURN( + std::string branch_index, + get_branch_index(conditional_thunk.branch_index_buffer())); + + return absl::StrReplaceAll( + conditional_thunk_invocation_format, + { + {"{{BRANCH_INDEX}}", branch_index}, + {"{{NUM_BRANCHES}}", absl::StrCat(branch_execution_impls.size())}, + {"{{BRANCH_EXECUTIONS}}", + absl::StrJoin(branch_execution_impls, "\n")}, + }); +} + +absl::StatusOr +ThunkProtoExecutionDeserializer::GetForLoopThunkRunImpl( + const xla::cpu::WhileThunkProto& while_thunk) { + if (!while_thunk.has_trip_count()) { + return xla::Internal("While thunk is missing trip count."); + } + int64_t trip_count = while_thunk.trip_count().value(); + + absl::string_view for_loop_thunk_invocation_format = R"( + // For Loop Thunk + { + for (int64_t loop_counter = 0; loop_counter < {{TRIP_COUNT}}; ++loop_counter) { + {{BODY_EXECUTION}}; + } + } + )"; + + TF_ASSIGN_OR_RETURN( + std::string body_execution, + ThunkSpecificRunImplFromThunkSequence(while_thunk.body_sequence())); + + return absl::StrReplaceAll(for_loop_thunk_invocation_format, + { + {"{{TRIP_COUNT}}", absl::StrCat(trip_count)}, + {"{{BODY_EXECUTION}}", body_execution}, + }); +} + +absl::StatusOr +ThunkProtoExecutionDeserializer::GetWhileThunkRunImpl( + const xla::cpu::ThunkProto& thunk) { + if (!thunk.has_while_thunk()) { + return xla::Internal( + "While thunk was expected when getting thunk run implementation."); + } + const xla::cpu::WhileThunkProto& while_thunk = thunk.while_thunk(); + + if (!while_thunk.has_trip_count()) { + return xla::Internal("Only while thunks with a trip count are supported."); + } + + return GetForLoopThunkRunImpl(while_thunk); +} + +absl::StatusOr +ThunkProtoExecutionDeserializer::CppDataTypeFromXlaType( + xla::PrimitiveType xla_type) { + switch (xla_type) { + case xla::F16: + return "Eigen::half"; + case xla::F32: + return "float"; + case xla::F64: + return "double"; + case xla::C64: + return "std::complex"; + case xla::C128: + return "std::complex"; + default: + return xla::Internal("Unsupported xla type: %d", xla_type); + } +} + +} // namespace tfcompile +} // namespace tensorflow diff --git a/tensorflow/compiler/aot/thunk_proto_execution_deserializer.h b/tensorflow/compiler/aot/thunk_proto_execution_deserializer.h new file mode 100644 index 000000000000..8e8679d28683 --- /dev/null +++ b/tensorflow/compiler/aot/thunk_proto_execution_deserializer.h @@ -0,0 +1,91 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_AOT_THUNK_PROTO_EXECUTION_DESERIALIZER_H_ +#define TENSORFLOW_COMPILER_AOT_THUNK_PROTO_EXECUTION_DESERIALIZER_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "xla/backends/cpu/runtime/convolution_lib.h" +#include "xla/backends/cpu/runtime/thunk.pb.h" +#include "xla/service/cpu/executable.pb.h" +#include "xla/xla_data.pb.h" + +namespace tensorflow { +namespace tfcompile { + +// Helper class for deserializing the contents of specific thunks into C++ code +// that is used to codegen the `Run` method of the tfcompiled models. +class ThunkProtoExecutionDeserializer { + public: + absl::StatusOr GetThunkSpecificRunImpl( + const xla::cpu::CompilationResultProto& proto) &&; + + absl::StatusOr ThunkSpecificRunImplFromThunkSequence( + const xla::cpu::ThunkSequenceProto& thunk_sequence_proto); + + protected: + absl::StatusOr GetMatmulFunction(xla::PrimitiveType xla_type, + bool is_single_threaded); + + absl::StatusOr GetDotThunkRunImpl( + const xla::cpu::ThunkProto& thunk); + + absl::StatusOr GetConvolutionFunction( + xla::PrimitiveType xla_type, bool is_single_threaded); + + absl::StatusOr GetConvolution2DRunImpl( + const xla::cpu::ConvolutionThunkProto& convolution_thunk, + const xla::cpu::ConvolutionCanonicalDims& canonical_dims); + + absl::StatusOr GetConvolutionFusionThunkRunImpl( + const xla::cpu::ThunkProto& thunk); + + absl::StatusOr GetRngGetAndUpdateStateThunkRunImpl( + const xla::cpu::ThunkProto& thunk); + + absl::StatusOr GetCallThunkRunImpl( + const xla::cpu::ThunkProto& thunk); + + absl::StatusOr GetKernelThunkRunImpl( + const xla::cpu::ThunkProto& thunk); + + absl::StatusOr GetCopyThunkRunImpl( + const xla::cpu::ThunkProto& thunk); + + absl::StatusOr GetConditionalThunkRunImpl( + const xla::cpu::ThunkProto& thunk); + + absl::StatusOr GetForLoopThunkRunImpl( + const xla::cpu::WhileThunkProto& while_thunk); + + absl::StatusOr GetWhileThunkRunImpl( + const xla::cpu::ThunkProto& thunk); + + absl::StatusOr CppDataTypeFromXlaType( + xla::PrimitiveType xla_type); + + private: + // The index of the next rng state to use when deserializing the rng state + // from the ThunkProto. 
+ int64_t rng_state_index_ = 0; +}; + +} // namespace tfcompile +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_AOT_THUNK_PROTO_EXECUTION_DESERIALIZER_H_ diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 23a6fa0d2404..39f93d17aa29 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -1,3 +1,5 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") load("@local_xla//xla/stream_executor:build_defs.bzl", "if_cuda_or_rocm") load( "@local_xla//xla/tsl:tsl.bzl", @@ -106,6 +108,10 @@ cc_library( "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", "@local_xla//xla/service:gpu_plugin", "//tensorflow/core/tfrt/common:pjrt_gpu_client_registration", + ]) + if_cuda([ + "@local_xla//xla/stream_executor/cuda:all_runtime", # buildcleaner: keep + ]) + if_rocm([ + "@local_xla//xla/stream_executor/rocm:all_runtime", # buildcleaner: keep ]), alwayslink = 1, ) diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc index 2b15a4affc76..50b263716988 100644 --- a/tensorflow/compiler/jit/compilability_check_util.cc +++ b/tensorflow/compiler/jit/compilability_check_util.cc @@ -370,11 +370,16 @@ bool RecursiveCompilabilityChecker::OpIsSlow(const Node& node) const { // https://github.com/tensorflow/tensorflow/pull/31012: // ResizeNearestNeighbor, ResizeBilinear, and ResizeBilinearGrad sometimes // create convolutions too large for CuDNN to handle. + // NonMaxSuppressionV3/V4 in XLA runs significantly slower than TF kernel in + // object detection models, specially when there are a lot of proposed + // bounding boxes. return node.type_string() == "SelfAdjointEigV2" || node.type_string() == "Svd" || node.type_string() == "Qr" || node.type_string() == "MatrixInverse" || node.type_string() == "MatrixSolve" || - node.type_string() == "ResizeBilinearGrad"; + node.type_string() == "ResizeBilinearGrad" || + node.type_string() == "NonMaxSuppressionV3" || + node.type_string() == "NonMaxSuppressionV4"; } bool RecursiveCompilabilityChecker::IsCompilableNode( diff --git a/tensorflow/compiler/jit/compilability_check_util_test.cc b/tensorflow/compiler/jit/compilability_check_util_test.cc index 0fe2d2d2fe96..ea24176bb04a 100644 --- a/tensorflow/compiler/jit/compilability_check_util_test.cc +++ b/tensorflow/compiler/jit/compilability_check_util_test.cc @@ -51,6 +51,7 @@ constexpr char kUncompilableFunctionName[] = "UncompilableFn"; constexpr char kUncompilableFunctionNodeName[] = "n_c_uncompilable"; constexpr char kUncompilableFunctionTwoName[] = "UncompilableFnTwo"; constexpr char kUncompilableFunctionNodeTwoName[] = "n_d_uncompilable"; +constexpr char kNonMaxSuppressionNodeName[] = "NonMaxSuppression"; // A dummy OpKernel for testing. class DummyCompilableOp : public XlaOpKernel { @@ -63,6 +64,7 @@ class DummyCompilableOp : public XlaOpKernel { // Register the DummyCompilableOp kernel for CPU. 
REGISTER_OP("InputFloatOp").Output("o: float"); +REGISTER_OP("InputInt32Op").Output("o: int32"); REGISTER_OP("CompilableOp").Input("i: float").Output("o: float"); REGISTER_XLA_OP(Name("CompilableOp").Device(DEVICE_CPU_XLA_JIT), DummyCompilableOp); @@ -554,5 +556,90 @@ TEST_F(CompilabilityCheckUtilTest, TestCanTriggerXlaCompilation) { EXPECT_TRUE(CanTriggerXlaCompilation(graph_def)); } +TEST_F(CompilabilityCheckUtilTest, CheckNonMaxSuppressionV3UncompilableSlowOp) { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + auto opts = builder.opts(); + + Node* boxes = ops::SourceOp("InputFloatOp", opts); + Node* scores = ops::SourceOp("InputFloatOp", opts); + Node* max_output_size = ops::SourceOp("InputInt32Op", opts); + Node* iou_threshold = ops::SourceOp("InputFloatOp", opts); + Node* score_threshold = ops::SourceOp("InputFloatOp", opts); + + NodeBuilder non_max_suppression_builder( + kNonMaxSuppressionNodeName, "NonMaxSuppressionV3", opts.op_registry()); + non_max_suppression_builder.Input(boxes) + .Input(scores) + .Input(max_output_size) + .Input(iou_threshold) + .Input(score_threshold) + .Attr("T", DT_FLOAT); + Node* non_max_suppression; + non_max_suppression = + builder.opts().FinalizeBuilder(&non_max_suppression_builder); + + GraphDef graph_def; + TF_EXPECT_OK(builder.ToGraphDef(&graph_def)); + auto* flib_runtime = GetFunctionLibraryRuntime(); + + EXPECT_FALSE(checker_->IsCompilableNode(*non_max_suppression, flib_runtime)); + + const auto uncompilable_nodes = + checker_->FindUncompilableNodes(*non_max_suppression, flib_runtime); + ASSERT_EQ(1, uncompilable_nodes.size()); + auto node_info_it = + uncompilable_nodes.find(NameAttrList().ShortDebugString()); + ASSERT_NE(uncompilable_nodes.end(), node_info_it); + + const auto& uncompilable_nodes_inside_function = node_info_it->second.second; + ASSERT_EQ(1, uncompilable_nodes_inside_function.size()); + const auto& uncompilable_node_info = uncompilable_nodes_inside_function.at(0); + EXPECT_TRUE(absl::StrContains(uncompilable_node_info.uncompilable_reason, + "slow operation")); +} + +TEST_F(CompilabilityCheckUtilTest, CheckNonMaxSuppressionV4UncompilableSlowOp) { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + auto opts = builder.opts(); + + Node* boxes = ops::SourceOp("InputFloatOp", opts); + Node* scores = ops::SourceOp("InputFloatOp", opts); + Node* max_output_size = ops::SourceOp("InputInt32Op", opts); + Node* iou_threshold = ops::SourceOp("InputFloatOp", opts); + Node* score_threshold = ops::SourceOp("InputFloatOp", opts); + + NodeBuilder non_max_suppression_v4_builder( + kNonMaxSuppressionNodeName, "NonMaxSuppressionV4", opts.op_registry()); + non_max_suppression_v4_builder.Input(boxes) + .Input(scores) + .Input(max_output_size) + .Input(iou_threshold) + .Input(score_threshold) + .Attr("T", DT_FLOAT); + Node* non_max_suppression_v4; + non_max_suppression_v4 = + builder.opts().FinalizeBuilder(&non_max_suppression_v4_builder); + + GraphDef graph_def; + TF_EXPECT_OK(builder.ToGraphDef(&graph_def)); + auto* flib_runtime = GetFunctionLibraryRuntime(); + + EXPECT_FALSE( + checker_->IsCompilableNode(*non_max_suppression_v4, flib_runtime)); + + const auto uncompilable_nodes = + checker_->FindUncompilableNodes(*non_max_suppression_v4, flib_runtime); + ASSERT_EQ(1, uncompilable_nodes.size()); + auto node_info_it = + uncompilable_nodes.find(NameAttrList().ShortDebugString()); + ASSERT_NE(uncompilable_nodes.end(), node_info_it); + + const auto& uncompilable_nodes_inside_function = node_info_it->second.second; + ASSERT_EQ(1, 
uncompilable_nodes_inside_function.size()); + const auto& uncompilable_node_info = uncompilable_nodes_inside_function.at(0); + EXPECT_TRUE(absl::StrContains(uncompilable_node_info.uncompilable_reason, + "slow operation")); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/device_compilation_cache.h b/tensorflow/compiler/jit/device_compilation_cache.h index 6137d1bfd95a..e6938024344b 100644 --- a/tensorflow/compiler/jit/device_compilation_cache.h +++ b/tensorflow/compiler/jit/device_compilation_cache.h @@ -107,8 +107,8 @@ class DeviceCompilationCache { const mutex_lock lock(compile_cache_mu_); absl::erase_if( cache_, - [&](std::pair>>& kv) { - const absl::Nullable entry = kv.second.get(); + [&](std::pair>& kv) { + Entry* absl_nullable const entry = kv.second.get(); if (entry == nullptr) { return true; } diff --git a/tensorflow/compiler/jit/device_compiler.h b/tensorflow/compiler/jit/device_compiler.h index fb0dbd2ae417..34b22033129b 100644 --- a/tensorflow/compiler/jit/device_compiler.h +++ b/tensorflow/compiler/jit/device_compiler.h @@ -406,7 +406,7 @@ absl::Status DeviceCompiler::CompileAsynchronous( template void DeviceCompiler::Finalize() { const mutex_lock lock(cluster_mutexes_mu_); - std::vector> cluster_mutexes; + std::vector cluster_mutexes; cluster_mutexes.reserve(cluster_mutexes_.size()); for (auto& [_, mutex] : cluster_mutexes_) { if (mutex != nullptr) { @@ -420,7 +420,7 @@ void DeviceCompiler::Finalize() { absl::c_sort(cluster_mutexes); std::vector cluster_mutex_locks; cluster_mutex_locks.reserve(cluster_mutexes.size()); - for (const absl::Nonnull mutex : cluster_mutexes) { + for (mutex* absl_nonnull const mutex : cluster_mutexes) { cluster_mutex_locks.emplace_back(*mutex); } diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 647c8d070806..468b85280e2a 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -38,7 +38,6 @@ limitations under the License. 
#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "absl/synchronization/mutex.h" -#include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/compiler/jit/device_compilation_profiler.h" #include "tensorflow/compiler/jit/device_compiler.h" diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 8041d500347d..c3a24f3e0f71 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -902,7 +902,7 @@ int64_t GetConstantTensorSize(Node* n) { if (n->op_def().name() != "Const") return -1; const TensorProto* proto = nullptr; - Status s = GetNodeAttr(n->def(), "value", &proto); + absl::Status s = GetNodeAttr(n->def(), "value", &proto); if (!s.ok()) return -1; if (!proto->has_tensor_shape()) { diff --git a/tensorflow/compiler/jit/pjrt_device_compiler_client.cc b/tensorflow/compiler/jit/pjrt_device_compiler_client.cc index f64468fd2d25..aac55d260c79 100644 --- a/tensorflow/compiler/jit/pjrt_device_compiler_client.cc +++ b/tensorflow/compiler/jit/pjrt_device_compiler_client.cc @@ -45,9 +45,10 @@ PjRtDeviceCompilerClient::BuildExecutable( const XlaCompiler::CompilationResult& result) { VLOG(2) << "Compiling to xla::PjRtLoadedExecutable."; - TF_ASSIGN_OR_RETURN(auto executable, - client_->Compile(*result.computation, - GetPjRtCompileOptions(options, result))); + TF_ASSIGN_OR_RETURN( + auto executable, + client_->CompileAndLoad(*result.computation, + GetPjRtCompileOptions(options, result))); VLOG(2) << "Compiled PJRT executable " << executable->name() << " num_replicas " << executable->num_replicas() @@ -77,8 +78,9 @@ PjRtDeviceCompilerClient::LoadExecutable( const XlaCompiler::CompilationResult& result, const std::string& serialized_executable) { VLOG(1) << "Deserializing from string to xla::PjRtLoadedExecutable."; - return client_->DeserializeExecutable(serialized_executable, - GetPjRtCompileOptions(options, result)); + return client_->LoadSerializedExecutable( + serialized_executable, GetPjRtCompileOptions(options, result), + xla::LoadOptions()); } void PjRtDeviceCompilerClient::WaitForProgramsToFinish() { diff --git a/tensorflow/compiler/jit/tests/BUILD b/tensorflow/compiler/jit/tests/BUILD index ed7f66ee50c3..40de3e19dfd6 100644 --- a/tensorflow/compiler/jit/tests/BUILD +++ b/tensorflow/compiler/jit/tests/BUILD @@ -118,5 +118,8 @@ tf_cc_test( "//tensorflow/compiler/jit:compilation_passes", "//tensorflow/compiler/jit:flags", "//tensorflow/core:test", + "//tensorflow/core/framework:graph_proto_cc", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", ], ) diff --git a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc index 74462a1cdfd1..dee77ac750ee 100644 --- a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc +++ b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc @@ -188,7 +188,7 @@ absl::Status AutoClusteringTest::RunAutoClusteringTestWithGzippedPbtxt( io::ZlibCompressionOptions::GZIP()); tstring decompressed_pbtxt_string; absl::Status s = in.ReadNBytes(INT_MAX, &decompressed_pbtxt_string); - if (!s.ok() && !errors::IsOutOfRange(s)) { + if (!s.ok() && !absl::IsOutOfRange(s)) { // OutOfRange is fine since we set the number of read bytes to INT_MAX. // Only return other kinds of errors. 
return s; diff --git a/tensorflow/compiler/jit/tests/device_compiler_serialize_options_test.cc b/tensorflow/compiler/jit/tests/device_compiler_serialize_options_test.cc index 3da7ac13eaea..b17a05c37a59 100644 --- a/tensorflow/compiler/jit/tests/device_compiler_serialize_options_test.cc +++ b/tensorflow/compiler/jit/tests/device_compiler_serialize_options_test.cc @@ -13,9 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include "absl/strings/match.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/mark_for_compilation_pass.h" #include "tensorflow/compiler/jit/tests/device_compiler_test_helper.h" +#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace tensorflow { diff --git a/tensorflow/compiler/jit/xla_platform_info.cc b/tensorflow/compiler/jit/xla_platform_info.cc index f9af695e33c1..2fa938160712 100644 --- a/tensorflow/compiler/jit/xla_platform_info.cc +++ b/tensorflow/compiler/jit/xla_platform_info.cc @@ -255,7 +255,7 @@ absl::Status BuildXlaDeviceCompiler(DeviceBase* device, return platform.status(); } - absl::StatusOr compiler_for_platform = + absl::StatusOr> compiler_for_platform = xla::Compiler::GetForPlatform(platform.value()); if (!compiler_for_platform.ok()) { // In some rare cases (usually in unit tests with very small clusters) we diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index 20c7d3abb35d..c11a761a0891 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -220,11 +220,11 @@ tf_cc_binary( srcs = ["tf_mlir_translate_main.cc"], deps = [ ":init_mlir", - "//tensorflow/compiler/mlir/lite/tools:translate_cl_options", - "//tensorflow/compiler/mlir/lite/tools:translate_registration", "//tensorflow/compiler/mlir/tensorflow:tf_xla_mlir_translate", "//tensorflow/compiler/mlir/tensorflow:translate_lib", "//tensorflow/compiler/mlir/tf2xla/tests/registration:graph_to_tf_executor_registration", + "//tensorflow/compiler/mlir/tools:translate_cl_options", + "//tensorflow/compiler/mlir/tools:translate_registration", "//tensorflow/core:lib", "//tensorflow/core:tensorflow", "@com_google_absl//absl/strings", diff --git a/tensorflow/compiler/mlir/glob_lit_test.bzl b/tensorflow/compiler/mlir/glob_lit_test.bzl index ad44b889cc62..079dc4adc269 100644 --- a/tensorflow/compiler/mlir/glob_lit_test.bzl +++ b/tensorflow/compiler/mlir/glob_lit_test.bzl @@ -11,6 +11,7 @@ load( "@local_xla//xla:lit.bzl", "lit_script_with_xla_gpu_cuda_data_dir", ) +load("@rules_python//python:py_test.bzl", "py_test") # Default values used by the test runner. _default_test_file_exts = ["mlir", ".pbtxt", ".td"] @@ -49,7 +50,7 @@ def _run_lit_test(name, data, size, tags, driver, features, exec_properties): """ # Disable tests on windows for now, to enable testing rest of all xla and mlir. 
- native.py_test( + py_test( name = name, srcs = ["@llvm-project//llvm:lit"], tags = tags + ["no_pip", "no_windows"], diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index f3830acc44bb..748ede9590c0 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -61,7 +61,7 @@ td_library( ], compatible_with = get_compatible_with_portable(), deps = [ - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_td_files", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_td_files", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", "@llvm-project//mlir:FuncTdFiles", "@llvm-project//mlir:InferTypeOpInterfaceTdFiles", @@ -101,15 +101,10 @@ td_library( gentbl_cc_library( name = "tensorflow_lite_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=TensorFlowLiteTd", - ], - "transforms/passes.h.inc", - ), - ], + tbl_outs = {"transforms/passes.h.inc": [ + "-gen-pass-decls", + "-name=TensorFlowLiteTd", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/passes.td", deps = [ @@ -120,23 +115,14 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_lite_ops_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-decls"], - "ir/tfl_ops.h.inc", - ), - ( - ["-gen-op-defs"], - "ir/tfl_ops.cc.inc", - ), - ( - [ - "-gen-dialect-doc", - "-dialect=tfl", - ], - "g3doc/tfl_ops.md", - ), - ], + tbl_outs = { + "ir/tfl_ops.h.inc": ["-gen-op-decls"], + "ir/tfl_ops.cc.inc": ["-gen-op-defs"], + "g3doc/tfl_ops.md": [ + "-gen-dialect-doc", + "-dialect=tfl", + ], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tfl_ops.td", deps = [ @@ -147,24 +133,12 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_lite_op_interfaces_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "ir/tfl_ops_interface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "ir/tfl_ops_interface.cc.inc", - ), - ( - ["-gen-dialect-decls"], - "ir/tfl_ops_dialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "ir/tfl_ops_dialect.cc.inc", - ), - ], + tbl_outs = { + "ir/tfl_ops_interface.h.inc": ["-gen-op-interface-decls"], + "ir/tfl_ops_interface.cc.inc": ["-gen-op-interface-defs"], + "ir/tfl_ops_dialect.h.inc": ["-gen-dialect-decls"], + "ir/tfl_ops_dialect.cc.inc": ["-gen-dialect-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tfl_op_interfaces.td", deps = [ @@ -175,24 +149,12 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_lite_op_enums_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-enum-decls"], - "ir/tfl_ops_enums.h.inc", - ), - ( - ["-gen-enum-defs"], - "ir/tfl_ops_enums.cc.inc", - ), - ( - ["-gen-attrdef-decls"], - "ir/tfl_ops_attrdefs.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "ir/tfl_ops_attrdefs.cc.inc", - ), - ], + tbl_outs = { + "ir/tfl_ops_enums.h.inc": ["-gen-enum-decls"], + "ir/tfl_ops_enums.cc.inc": ["-gen-enum-defs"], + "ir/tfl_ops_attrdefs.h.inc": ["-gen-attrdef-decls"], + "ir/tfl_ops_attrdefs.cc.inc": ["-gen-attrdef-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tfl_op_enums.td", deps = [ @@ -203,12 +165,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_lite_prepare_tf_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - 
"transforms/generated_prepare_tf.inc", - ), - ], + tbl_outs = {"transforms/generated_prepare_tf.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/prepare_patterns.td", deps = [":tensorflow_lite_patterns_td_files"], @@ -217,12 +174,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_lite_lower_static_tensor_list_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_lower_static_tensor_list.inc", - ), - ], + tbl_outs = {"transforms/generated_lower_static_tensor_list.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/tensorlist_patterns.td", deps = [":tensorflow_lite_patterns_td_files"], @@ -231,12 +183,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_lite_legalize_tf_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_legalize_tf.inc", - ), - ], + tbl_outs = {"transforms/generated_legalize_tf.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/legalize_patterns.td", deps = [":tensorflow_lite_patterns_td_files"], @@ -245,12 +192,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_lite_legalize_variables_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_legalize_variables.inc", - ), - ], + tbl_outs = {"transforms/generated_legalize_variables.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/legalize_variables.td", deps = [":tensorflow_lite_patterns_td_files"], @@ -259,12 +201,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_lite_optimize_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_optimize.inc", - ), - ], + tbl_outs = {"transforms/generated_optimize.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/optimize_patterns.td", deps = [":tensorflow_lite_patterns_td_files"], @@ -273,12 +210,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_lite_optimize_batch_matmul_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_optimize_batch_matmul.inc", - ), - ], + tbl_outs = {"transforms/generated_optimize_batch_matmul.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/optimize_batch_matmul.td", deps = [":tensorflow_lite_patterns_td_files"], @@ -287,12 +219,7 @@ gentbl_cc_library( gentbl_cc_library( name = "optimize_broadcast_like_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_optimize_broadcast_like.inc", - ), - ], + tbl_outs = {"transforms/generated_optimize_broadcast_like.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/optimize_broadcast_like_patterns.td", deps = [":tensorflow_lite_patterns_td_files"], @@ -301,12 +228,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_lite_quantize_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_quantize.inc", - ), - ], + tbl_outs = {"transforms/generated_quantize.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/quantize_patterns.td", deps = 
[":tensorflow_lite_patterns_td_files"], @@ -315,12 +237,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_lite_quantize_by_converter_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_quantize_by_converter.inc", - ), - ], + tbl_outs = {"transforms/generated_quantize_by_converter.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/quantize_by_converter_patterns.td", deps = [":tensorflow_lite_patterns_td_files"], @@ -329,12 +246,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_lite_post_quantize_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_post_quantize.inc", - ), - ], + tbl_outs = {"transforms/generated_post_quantize.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/post_quantize_patterns.td", deps = [":tensorflow_lite_patterns_td_files"], @@ -343,12 +255,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_lite_legalize_tensorlist_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_legalize_tensorlist.inc", - ), - ], + tbl_outs = {"transforms/generated_legalize_tensorlist.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/legalize_tensorlist.td", deps = [":tensorflow_lite_patterns_td_files"], @@ -380,12 +287,7 @@ cc_library( gentbl_cc_library( name = "tensorflow_lite_canonicalize_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "ir/tfl_canonicalize.inc", - ), - ], + tbl_outs = {"ir/tfl_canonicalize.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tfl_canonicalize.td", deps = [":tensorflow_lite_patterns_td_files"], @@ -395,6 +297,8 @@ cc_library( name = "utils", hdrs = ["utils/utils.h"], deps = [ + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@llvm-project//llvm:Support", "@llvm-project//mlir:Dialect", "@llvm-project//mlir:IR", @@ -402,6 +306,18 @@ cc_library( ], ) +tf_cc_test( + name = "utils_test", + srcs = ["utils/utils_test.cc"], + deps = [ + ":utils", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "attribute_utils", srcs = ["utils/attribute_utils.cc"], @@ -473,6 +389,7 @@ cc_library( deps = [ ":common", ":converter_flags_proto_cc", + ":optimize_broadcast_like_pass_options", ":optimize_pass_options", ":pass_options", ":pass_options_setter", @@ -508,9 +425,9 @@ cc_library( ":tensorflow_lite_op_interfaces_inc_gen", ":tensorflow_lite_ops_inc_gen", ":utils", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/lite/schema:schema_fbs", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", "//tensorflow/compiler/mlir/tensorflow:tensorflow_op_interfaces", @@ -587,6 +504,7 @@ cc_library( hdrs = [ "ir/tfl_ops.h", "transforms/canonicalize_boundary_value_pass.h", + "transforms/cleanup_optimization_barrier_pass.h", "transforms/optimize_batch_matmul_pass.h", "transforms/optimize_broadcast_like_pass.h", "transforms/optimize_pass.h", @@ -605,9 +523,11 @@ 
cc_library( deps = [ ":attribute_utils", ":canonicalize_boundary_value", + ":cleanup_optimization_barrier", ":converter_inc", ":cost_estimators", ":optimize_broadcast_like_pass", + ":optimize_broadcast_like_pass_options", ":optimize_pass_options", ":pass", ":pass_options", @@ -629,10 +549,10 @@ cc_library( ":tensorflow_lite_tf_unfreeze_global_tensors", ":tensorflow_lite_unfold_large_splat_constants", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/lite/schema:schema_fbs", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", "//tensorflow/compiler/mlir/tensorflow:tensorflow_op_interfaces", @@ -642,6 +562,8 @@ cc_library( "//tensorflow/core:framework", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@eigen_archive//:eigen3", "@llvm-project//llvm:Support", @@ -798,7 +720,7 @@ cc_library( ], deps = [ ":tensorflow_lite", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "@llvm-project//llvm:Support", @@ -957,6 +879,26 @@ cc_library( ], ) +cc_library( + name = "cleanup_optimization_barrier", + srcs = [ + "transforms/cleanup_optimization_barrier_pass.cc", + ], + hdrs = [ + "transforms/cleanup_optimization_barrier_pass.h", + ], + deps = [ + ":pass", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@stablehlo//:stablehlo_ops", + ], +) + cc_library( name = "tensorflow_lite_legalize_tf_analyze_variables", srcs = [ @@ -1107,6 +1049,7 @@ cc_library( ":fake_quant_utils", ":lstm_utils", ":nms_utils", + ":optimize_broadcast_like_pass_options", ":perception_ops_utils", ":shape_and_size_utils", ":stateful_ops_utils", @@ -1123,17 +1066,17 @@ cc_library( ":validators", ":variables_utils", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/lite/schema:schema_fbs", - "//tensorflow/compiler/mlir/lite/stablehlo:legalize_tf", "//tensorflow/compiler/mlir/lite/stablehlo:optimize_layout", "//tensorflow/compiler/mlir/lite/stablehlo:prepare_hlo", "//tensorflow/compiler/mlir/lite/stablehlo:tf_legalize_hlo", "//tensorflow/compiler/mlir/lite/stablehlo:tfl_legalize_chlo", "//tensorflow/compiler/mlir/lite/stablehlo:tfl_legalize_hlo", "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", + 
"//tensorflow/compiler/mlir/stablehlo:legalize_tf", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", @@ -1173,6 +1116,8 @@ cc_library( "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", "@local_xla//xla/mlir_hlo", + "@local_xla//xla/mlir_hlo:type_conversion", + "@local_xla//xla/mlir_hlo:unfuse_batch_norm", "@stablehlo//:stablehlo_ops", ], ) @@ -1196,8 +1141,8 @@ cc_library( ":tensorflow_lite_optimize_inc_gen", ":utils", ":validators", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", "//tensorflow/compiler/mlir/tensorflow:verification_utils", @@ -1213,6 +1158,29 @@ cc_library( ], ) +cc_library( + name = "optimize_batch_matmul_utils", + srcs = ["transforms/tflite_passes/optimize_batch_matmul_utils.cc"], + hdrs = ["transforms/tflite_passes/optimize_batch_matmul_utils.h"], + deps = [ + ":utils", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + +tf_cc_test( + name = "optimize_batch_matmul_utils_test", + srcs = ["transforms/tflite_passes/optimize_batch_matmul_utils_test.cc"], + deps = [ + ":optimize_batch_matmul_utils", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "tensorflow_lite_optimize_batch_matmul", srcs = [ @@ -1224,6 +1192,7 @@ cc_library( ], deps = [ ":convert_type", + ":optimize_batch_matmul_utils", ":pass", ":pass_options", ":tensorflow_lite_ops", @@ -1231,7 +1200,6 @@ cc_library( ":tensorflow_lite_passes_inc_gen", ":utils", ":validators", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", @@ -1258,14 +1226,16 @@ cc_library( ], deps = [ ":optimize_broadcast_like_inc_gen", + ":optimize_broadcast_like_pass_options", ":pass", - ":pass_options", ":tensorflow_lite_ops", + ":utils", "@llvm-project//llvm:Support", "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:Dialect", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", ], @@ -1329,18 +1299,17 @@ cc_library( "transforms/prepare_quantize_helper.cc", "transforms/quantize.cc", "transforms/quantize_variables.cc", - "transforms/tfl_quantization_driver.cc", "utils/generated_op_quant_spec_getters.inc", ], hdrs = [ "transforms/lower_quant_annotations_helper.h", "transforms/passes.h", "transforms/prepare_quantize_helper.h", - "transforms/tfl_quantization_driver.h", ], deps = [ "convert_type", ":op_quant_spec_getters_inc", + ":optimize_broadcast_like_pass_options", ":shape_and_size_utils", ":stateful_ops_utils", ":tensorflow_lite", @@ -1349,15 +1318,17 @@ cc_library( ":tensorflow_lite_quantize_by_converter_inc_gen", ":tensorflow_lite_quantize_inc_gen", ":validators", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib", + 
"//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:tfl_quantization_driver", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/lite/quantization/lite:tfl_to_std", "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/compiler/mlir/lite/tools/optimize:operator_property", "//tensorflow/compiler/mlir/quantization/common:uniform_quantized_types", "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -1442,7 +1413,7 @@ filegroup( gentbl_cc_library( name = "op_quant_spec_getters_inc", compatible_with = get_compatible_with_portable(), - tbl_outs = [([], "utils/generated_op_quant_spec_getters.inc")], + tbl_outs = {"utils/generated_op_quant_spec_getters.inc": []}, tblgen = "//tensorflow/compiler/mlir/lite/quantization:op_quant_spec_getters_gen", td_file = "ir/tfl_ops.td", deps = [ @@ -1453,7 +1424,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tflite_op_coverage_spec_inc", compatible_with = get_compatible_with_portable(), - tbl_outs = [([], "utils/tflite_op_coverage_spec.inc")], + tbl_outs = {"utils/tflite_op_coverage_spec.inc": []}, tblgen = "//tensorflow/compiler/mlir/lite/quantization:tflite_op_coverage_spec_getters_gen", td_file = "ir/tfl_ops.td", visibility = ["//learning/brain/mobile/model_optimization/g3doc/autogen:__pkg__"], @@ -1478,22 +1449,16 @@ tf_native_cc_binary( gentbl_cc_library( name = "converter_inc", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["--gen-operator-converters"], - "operator_converters.inc", - ), - ( - ["--gen-runtime-verifiers"], - "runtime_verifiers.inc", - ), - ], + tbl_outs = { + "operator_converters.inc": ["--gen-operator-converters"], + "runtime_verifiers.inc": ["--gen-runtime-verifiers"], + }, tblgen = ":converter-gen", td_file = "ir/tfl_ops.td", test = 1, deps = [ ":tensorflow_lite_ops_td_files", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_td_files", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_td_files", ], ) @@ -1637,6 +1602,7 @@ cc_library( ":tensorflow_lite", "//tensorflow/compiler/mlir/lite/core:absl_error_model_builder", "//tensorflow/compiler/mlir/lite/experimental/remat:metadata_util", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/lite/schema:debug_metadata_fbs_with_mutable", "//tensorflow/compiler/mlir/lite/schema:schema_fbs", @@ -1644,7 +1610,6 @@ cc_library( "//tensorflow/compiler/mlir/lite/schema:schema_utils", "//tensorflow/compiler/mlir/lite/stablehlo:legalize_stablehlo_composite_to_tfl_custom", "//tensorflow/compiler/mlir/lite/stablehlo:legalize_stablehlo_to_vhlo_pass", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", @@ -1741,6 +1706,15 @@ cc_library( ], ) +cc_library( + name = 
"optimize_broadcast_like_pass_options", + hdrs = ["transforms/optimize_broadcast_like_pass_options.h"], + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Pass", + ], +) + cc_library( name = "flatbuffer_translate_lib", hdrs = [ @@ -1818,7 +1792,7 @@ cc_library( ], deps = [ "//tensorflow/compiler/mlir/lite:converter_flags_proto_cc", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", ], @@ -1845,8 +1819,7 @@ tf_cc_binary( ":tf_to_tfl_flatbuffer", "//tensorflow/compiler/mlir:init_mlir", "//tensorflow/compiler/mlir/lite:converter_flags_proto_cc", - "//tensorflow/compiler/mlir/lite/tools:translate_cl_options", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/compiler/tf2xla/kernels:xla_ops", @@ -1889,10 +1862,10 @@ cc_library( ":tensorflow_lite_optimize_batch_matmul", # buildcleaner: keep ":tensorflow_lite_push_transpose_through_ewise_pass", # buildcleaner: keep ":tensorflow_lite_quantize", # buildcleaner: keep - ":tensorflow_lite_tf_unfreeze_global_tensors", ":variable_freezing_pipeline", "//tensorflow/compiler/mlir/lite/core:macros", "//tensorflow/compiler/mlir/lite/quantization:quantization_passes", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/lite/quantization/tensorflow:tf_quantization_passes", "//tensorflow/compiler/mlir/lite/stablehlo:build_stablehlo_composite", "//tensorflow/compiler/mlir/lite/stablehlo:compose_uniform_quantized_type_pass", @@ -1902,13 +1875,12 @@ cc_library( "//tensorflow/compiler/mlir/lite/stablehlo:legalize_tf_xla_call_module_to_stablehlo_pass", "//tensorflow/compiler/mlir/lite/stablehlo:lift_callsite_loc_caller", "//tensorflow/compiler/mlir/lite/stablehlo:prepare_hlo", # buildcleaner: keep - "//tensorflow/compiler/mlir/lite/stablehlo:rename_entrypoint_to_main", "//tensorflow/compiler/mlir/lite/stablehlo:tf_legalize_hlo", # buildcleaner: keep "//tensorflow/compiler/mlir/lite/stablehlo:tfl_legalize_chlo", # buildcleaner: keep "//tensorflow/compiler/mlir/lite/stablehlo:tfl_legalize_hlo", # buildcleaner: keep "//tensorflow/compiler/mlir/lite/stablehlo:transforms", "//tensorflow/compiler/mlir/lite/stablehlo:uniform_quantized_stablehlo_to_tfl_pass", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", + "//tensorflow/compiler/mlir/stablehlo:rename_entrypoint_to_main", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes", "//tensorflow/core:core_cpu_base", @@ -1943,6 +1915,7 @@ cc_library( "//tensorflow/compiler/mlir/lite/experimental/remat:metadata_util", "//tensorflow/compiler/mlir/lite/metrics:converter_error_data_proto_cc", "//tensorflow/compiler/mlir/lite/metrics:error_collector_inst", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy:quantize_weights", "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/compiler/mlir/lite/stablehlo:legalize_stablehlo_composite_to_tfl_custom", @@ -1951,7 
+1924,6 @@ cc_library( "//tensorflow/compiler/mlir/lite/stablehlo:stablehlo_util", "//tensorflow/compiler/mlir/lite/stablehlo:transforms", "//tensorflow/compiler/mlir/lite/tools/optimize:reduced_precision_metadata", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/compiler/mlir/quantization/stablehlo:quantize_passes", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", diff --git a/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h index db9715e99c1a..d94c585e4d18 100644 --- a/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h +++ b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h @@ -23,14 +23,14 @@ limitations under the License. #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/raw_ostream.h" #include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" namespace mlir { namespace TFL { // A config that controls which passes get run as part TFLite converter. struct PassConfig { - explicit PassConfig(quant::QuantizationSpecs specs) + explicit PassConfig(QuantizationSpecs specs) : quant_specs(std::move(specs)) {} // If `emit_builtin_tflite_ops` is true, TF Lite legalization passes will be @@ -42,7 +42,7 @@ struct PassConfig { // The allowlist of functions that would be preserved after trimming. llvm::ArrayRef trim_functions_allowlist; // All information about quantization. - quant::QuantizationSpecs quant_specs; + QuantizationSpecs quant_specs; // If `form_clusters` is true , clusters are formed by grouping consecutive // ops of the same device, under a `tf_device.launch` op. bool form_clusters = false; @@ -90,8 +90,7 @@ struct PassConfig { bool reduce_type_precision = false; // Whether to consider this model a quantized model with quantize/dequantize // ops and to convert kernels to quantized kernels wherever appropriate. - quant::QDQConversionMode qdq_conversion_mode = - quant::QDQConversionMode::kQDQNone; + QDQConversionMode qdq_conversion_mode = QDQConversionMode::kQDQNone; // When set to true, StableHLO Quantizer is run. The full configuration for // the quantizer is at `ConverterFlags::quantization_config`. @@ -107,6 +106,12 @@ struct PassConfig { // When set to true, convert +Inf/-Inf to MIN/MAX float value and output of // convert only contains finite values. bool canonicalizing_inf_as_min_max_float = true; + + // When set to true, allows fusion of dynamic shaped broadcast ops. It helps + // fusing implicit broadcasting ops when output shape has dynamic dimensions, + // but it may cause incorrect results when broadcasting ops are introduced by + // explicit broadcasting in the source model. 
+  bool unsafe_fuse_dynamic_shaped_broadcast = false;
 };
 
 inline llvm::raw_ostream& operator<<(llvm::raw_ostream& os,
@@ -133,6 +138,8 @@ inline llvm::raw_ostream& operator<<(llvm::raw_ostream& os,
          << pass_config.enable_stablehlo_conversion
          << "\nlegalize_custom_tensor_list_ops: "
          << pass_config.legalize_custom_tensor_list_ops
+         << "\nunsafe_fuse_dynamic_shaped_broadcast: "
+         << pass_config.unsafe_fuse_dynamic_shaped_broadcast
          << "\nreduce_type_precision: " << pass_config.reduce_type_precision
          << "\nconvert_qdq_format: "
          << GetQDQQuantModeString(pass_config.qdq_conversion_mode)
diff --git a/tensorflow/compiler/mlir/lite/converter_flags.proto b/tensorflow/compiler/mlir/lite/converter_flags.proto
index 5b6b9e2ca752..1c1a1ad00aea 100644
--- a/tensorflow/compiler/mlir/lite/converter_flags.proto
+++ b/tensorflow/compiler/mlir/lite/converter_flags.proto
@@ -41,7 +41,7 @@ enum FileFormat {
 // of as properties of models, instead describing how models are to be
 // processed in the context of the present tooling job.
 //
-// Next ID to use: 68.
+// Next ID to use: 69.
 message ConverterFlags {
   // Input file format
   optional FileFormat input_format = 1;
@@ -385,4 +385,10 @@ message ConverterFlags {
   // possible rather than quantizing any op that is possible to quantize.
   // WARNING: Experimental interface, subject to change.
   optional bool strict_qdq_mode = 67 [default = false];
+
+  // When set to true, allows fusion of dynamic shaped broadcast ops. It helps
+  // fusing implicit broadcasting ops when output shape has dynamic dimensions,
+  // but it may cause incorrect results when broadcasting ops are introduced by
+  // explicit broadcasting in the source model.
+  optional bool unsafe_fuse_dynamic_shaped_broadcast = 68 [default = false];
 }
diff --git a/tensorflow/compiler/mlir/lite/converter_gen.cc b/tensorflow/compiler/mlir/lite/converter_gen.cc
index 6869783209e2..ba186348a97c 100644
--- a/tensorflow/compiler/mlir/lite/converter_gen.cc
+++ b/tensorflow/compiler/mlir/lite/converter_gen.cc
@@ -568,7 +568,7 @@ static bool RuntimeVerifierWriterMain(raw_ostream &os,
      << "::VerifyTflRuntimeConstraints(::mlir::Operation *op, bool "
         "emit_error_on_verify_fail) {\n";
   os << " auto top = cast<" << op.getCppClassName() << ">(op); (void)top;\n";
-  verify_ctx.addSubst("_op", "top");
+  verify_ctx.addSubst("_op", "(*op)");
 
   for (int i = 0, e = op.getNumOperands(); i < e; ++i) {
     auto &value = op.getOperand(i);
diff --git a/tensorflow/compiler/mlir/lite/core/api/flatbuffer_conversions.h b/tensorflow/compiler/mlir/lite/core/api/flatbuffer_conversions.h
index ed452c9084cb..0112b1ef84a9 100644
--- a/tensorflow/compiler/mlir/lite/core/api/flatbuffer_conversions.h
+++ b/tensorflow/compiler/mlir/lite/core/api/flatbuffer_conversions.h
@@ -48,9 +48,8 @@ class BuiltinDataAllocator {
   // deallocation.
   template <typename T>
   T* AllocatePOD() {
-    // TODO(b/154346074): Change this to is_trivially_destructible when all
-    // platform targets support that properly.
-    static_assert(std::is_pod<T>::value, "Builtin data structure must be POD.");
+    static_assert(std::is_trivially_destructible<T>::value,
+                  "Builtin data structure must be POD.");
     void* allocated_memory = this->Allocate(sizeof(T), alignof(T));
     return new (allocated_memory) T();
   }
diff --git a/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h b/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h
index 1327162f2326..c580bf03cd3f 100644
--- a/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h
+++ b/tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h
@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+
 /// WARNING: Users of TensorFlow Lite should not include this file directly,
-/// but should instead include
-/// "third_party/tensorflow/lite/c/builtin_op_data.h".
-/// Only the TensorFlow Lite implementation itself should include this
-/// file directly.
+/// only the TensorFlow Lite implementation itself should.
+
+// IWYU pragma: private, include "third_party/tensorflow/lite/c/builtin_op_data.h"
+
 #ifndef TENSORFLOW_COMPILER_MLIR_LITE_CORE_C_BUILTIN_OP_DATA_H_
 #define TENSORFLOW_COMPILER_MLIR_LITE_CORE_C_BUILTIN_OP_DATA_H_
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/BUILD b/tensorflow/compiler/mlir/lite/experimental/tac/BUILD
index f3edb169515b..e4d0101245ba 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/BUILD
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/BUILD
@@ -98,12 +98,7 @@ cc_library(
 gentbl_cc_library(
     name = "transform_patterns_inc_gen",
     compatible_with = get_compatible_with_portable(),
-    tbl_outs = [
-        (
-            ["-gen-rewriters"],
-            "transforms/generated_transform_patterns.inc",
-        ),
-    ],
+    tbl_outs = {"transforms/generated_transform_patterns.inc": ["-gen-rewriters"]},
     tblgen = "@llvm-project//mlir:mlir-tblgen",
     td_file = "transforms/transform_patterns.td",
     deps = [
@@ -128,7 +123,6 @@ cc_library(
     deps = [
         ":common",
         "//tensorflow/compiler/mlir/lite:tensorflow_lite",
-        "//tensorflow/compiler/mlir/quantization/common/quantization_lib",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tensorflow:verification_utils",
         "@llvm-project//llvm:Support",
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.cc b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.cc
index 19cd2e081a7d..91dc26155fc6 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.cc
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.cc
@@ -59,6 +59,15 @@ double GpuHardware::GetHardwareSwitchingCost(const TargetHardware* from,
          kCrossHardwareTransferFixedCost;
 }
 
+bool GpuHardware::IsOpSupported(mlir::Operation* op) const {
+  if (TargetHardware::IsOpSupported(op)) {
+    return true;
+  }
+
+  // We also support quantized ops.
+ return !NotTFLQuantDequantizeOp(op); +} + namespace { // GPU constexpr float kGPUArithmeticUnitCost = 0.2; diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.h b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.h index 149c2076a615..cc13c6e36be2 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.h +++ b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.h @@ -41,6 +41,8 @@ class GpuHardware : public TargetHardware { double GetHardwareSwitchingCost(const TargetHardware* from, size_t buffer_size) const override; + + bool IsOpSupported(mlir::Operation* op) const override; }; } // namespace tac } // namespace TFL diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD b/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD index 573449f6eff0..4ea57f3c1cc9 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD +++ b/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD @@ -21,9 +21,9 @@ cc_library( "//tensorflow/compiler/mlir/lite/experimental/tac:tflite_importer_exporter", "//tensorflow/compiler/mlir/lite/experimental/tac/hardwares:all-target-hardwares", "//tensorflow/compiler/mlir/tensorflow", - "//third_party/python_runtime:headers", # buildcleaner: keep "@com_google_absl//absl/status", "@llvm-project//mlir:IR", + "@local_xla//third_party/python_runtime:headers", # buildcleaner: keep ], ) @@ -104,7 +104,7 @@ pybind_extension( deps = [ ":tac_wrapper_lib", "//tensorflow/python/lib/core:pybind11_lib", - "//third_party/python_runtime:headers", + "@local_xla//third_party/python_runtime:headers", "@pybind11", ], ) diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/tac-filter.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/tac-filter.mlir index 9b6d68c49f53..5afdc4370641 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/tac-filter.mlir +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/tac-filter.mlir @@ -62,3 +62,23 @@ module { func.return } } + +// ----- + +// expected-remark@below {{Tac filter (0): filter type: function filter SKIP_TARGET_ANNOTATION, filter_pattern: "^testFunction"}} +// expected-remark@below {{Tac filter (0) specified but not applied to any op}} +// expected-remark@below {{Tac filter (1): filter type: function filter INCLUDE_TARGET_ANNOTATION, filter_pattern: "testFunctionInclude"}} +// expected-remark@below {{Tac filter (1) specified but not applied to any op}} +// expected-remark@below {{Tac filter (2): filter type: op filter, filter_pattern: "^test_op"}} +module { + // CHECK-LABEL: testOpMultipleResults + // expected-remark@+1 {{all ops filtered by tac filter (2): "tfl.split_v"}} + func.func @testOpMultipleResults(%arg0: tensor<16x4x4xf32>) -> (tensor<7x4x4xf32>, tensor<3x4x4xf32>, tensor<6x4x4xf32>) { + %size_splits = arith.constant dense<[7, 3, 6]> : tensor<3xi32> + %split_dim = arith.constant dense<0> : tensor + // CHECK: tfl.split_v + // CHECK-SAME: tac.skip_target_annotation + %0, %1, %2 = "tfl.split_v"(%arg0, %size_splits, %split_dim) {num_splits = 3 : i32} : (tensor<16x4x4xf32>, tensor<3xi32>, tensor) -> (tensor<7x4x4xf32>, tensor<3x4x4xf32>, tensor<6x4x4xf32>) loc("test_op_split"("/tmp/test_model.tflite":0:0)) + func.return %0, %1, %2 : tensor<7x4x4xf32>, tensor<3x4x4xf32>, tensor<6x4x4xf32> + } +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/get_alternative_subgraph.cc 
b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/get_alternative_subgraph.cc
index fd4852b34ed3..f9a14eef8378 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/get_alternative_subgraph.cc
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/get_alternative_subgraph.cc
@@ -202,7 +202,6 @@ bool AlternativeSubgraphPass::IsAllSupportedbySpec(
   bool found_unsupported = false;
   func.walk([&](Operation* op) {
     if (IsNonConstOp(op) && !IsTerminatorOp(op) &&
-        NotTFLQuantDequantizeOp(op) &&
        !llvm::isa(op) &&
         !IsSupported(op, device_inference_type.hardware)) {
       found_unsupported = true;
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_filter.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_filter.cc
index 82fe3471e4da..8dee7c090226 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_filter.cc
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_filter.cc
@@ -127,12 +127,11 @@ void ApplyTacFilter(
   }
 
   auto should_filter_op = [](mlir::Operation* op) {
-    return IsNonConstOp(op) && NotTFLQuantDequantizeOp(op) &&
-           !IsTerminatorOp(op) &&
+    return IsNonConstOp(op) && !IsTerminatorOp(op) &&
           !llvm::isa(op);
   };
 
-  auto map_op_to_cpu = [&](mlir::Operation* op, std::string name) {
+  auto map_op_to_cpu = [&](mlir::Operation* op) {
     if (!should_filter_op(op)) {
       return;
     }
@@ -157,8 +156,14 @@ void ApplyTacFilter(
   OpFilter::MatchType match_type = tac_filter.op_filter().match_type();
   OpFilter::DeviceType device_type = tac_filter.op_filter().device_type();
   module.walk([&](Operation* op) {
-    auto named_loc = mlir::dyn_cast<NameLoc>(op->getLoc());
-    if (!named_loc) {
+    NameLoc loc;
+    if (auto name_loc = mlir::dyn_cast<NameLoc>(op->getLoc())) {
+      loc = name_loc;
+    } else if (auto fused_loc = mlir::dyn_cast<FusedLoc>(op->getLoc())) {
+      loc = dyn_cast<NameLoc>(fused_loc.getLocations().front());
+    }
+
+    if (!loc) {
       return;
     }
     // There can be two kinds of `match_type`:
@@ -171,11 +176,11 @@ void ApplyTacFilter(
     //
     // The code below maps an op to the appropriate device based on the above
    // fields.
-    if (op_regex.match(named_loc.getName())) {
+    if (op_regex.match(loc.getName())) {
       switch (match_type) {
         case OpFilter::MATCH:
           if (device_type == OpFilter::CPU) {
-            map_op_to_cpu(op, named_loc.getName().str());
+            map_op_to_cpu(op);
             return;
           }
           map_op_to_custom_device(op);
@@ -187,7 +192,7 @@ void ApplyTacFilter(
       switch (match_type) {
         case OpFilter::INVERT_MATCH:
           if (device_type == OpFilter::CPU) {
-            map_op_to_cpu(op, named_loc.getName().str());
+            map_op_to_cpu(op);
             return;
           }
           map_op_to_custom_device(op);
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/target_annotation.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/target_annotation.cc
index 6d1bf7ab9341..e3d1a4e47e78 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/target_annotation.cc
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/target_annotation.cc
@@ -140,8 +140,7 @@ void TargetAnnotationPass::runOnFunction() {
   func.walk([&](Operation* op) {
     // We only care about TFL dialect.
- if (IsNonConstOp(op) && NotTFLQuantDequantizeOp(op) && - !IsTerminatorOp(op) && + if (IsNonConstOp(op) && !IsTerminatorOp(op) && !llvm::isa(op)) { SetTargetAnnotation(op, device_specs_flag_, &builder); } diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/utils/BUILD b/tensorflow/compiler/mlir/lite/experimental/tac/utils/BUILD index bf830df4cd39..168c65efb38f 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/utils/BUILD +++ b/tensorflow/compiler/mlir/lite/experimental/tac/utils/BUILD @@ -19,6 +19,8 @@ cc_library( deps = [ "//tensorflow/compiler/mlir/lite:flatbuffer_translate_lib", "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/lite/experimental/tac:common", + "//tensorflow/compiler/mlir/lite/stablehlo:prepare_hlo", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:str_format", @@ -29,6 +31,8 @@ cc_library( "@llvm-project//mlir:FuncExtensions", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ReconcileUnrealizedCasts", "@llvm-project//mlir:Support", ], ) diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/utils/utils.cc b/tensorflow/compiler/mlir/lite/experimental/tac/utils/utils.cc index 6c6590664af9..3ac7acf53431 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/utils/utils.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/utils/utils.cc @@ -30,6 +30,7 @@ limitations under the License. #include "llvm/Support/SourceMgr.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" // from @llvm-project #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/Extensions/AllExtensions.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -38,10 +39,14 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h" namespace mlir { namespace TFL { @@ -97,6 +102,22 @@ absl::Status ExportFlatbufferOrMlir( module.print(os); os.flush(); } else { + // This extra attribute is added by TAC pass. We need to remove it before + // converting to VHLO. + module.walk([&](mlir::Operation* op) { + if (op->hasAttr(mlir::TFL::tac::kSkipTargetAnnotation)) { + op->removeAttr(mlir::TFL::tac::kSkipTargetAnnotation); + } + }); + // Converts stablehlo to vhlo so that flatbuffer export can handle it. 
+ auto pass_manager = + std::make_unique(module.getContext()); + pass_manager->addPass(mlir::odml::createLegalizeStablehloToVhloPass()); + pass_manager->addPass(mlir::createReconcileUnrealizedCastsPass()); + if (failed(pass_manager->run(module))) { + return absl::UnknownError("Failed to legalize stablehlo to vhlo."); + } + tflite::FlatbufferExportOptions options; options.converter_flags.set_force_select_tf_ops(false); options.converter_flags.set_allow_custom_ops(true); @@ -109,7 +130,8 @@ absl::Status ExportFlatbufferOrMlir( if (custom_option_alignment.has_value()) { options.custom_option_alignment = *custom_option_alignment; } - if (!tflite::MlirToFlatBufferTranslateFunction(module, options, &result)) { + if (!tflite::MlirToFlatBufferTranslateFunction( + module, options, &result, /*serialize_stablehlo_ops=*/true)) { return absl::UnknownError("Failed to export tflite file."); } } diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index 4b95c46902bf..6045278ffa54 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -1716,30 +1716,34 @@ void CreateFlexbufferVector( const std::unique_ptr& flex_builder, std::string& name, const mlir::Attribute& attr) { auto start = flex_builder->StartVector(name.c_str()); - auto array = attr.cast().getValue(); + auto array = mlir::cast(attr).getValue(); for (int i = 0; i < array.size(); i++) { if (llvm::isa(array[i])) { flex_builder->Bool(name.c_str(), - array[i].cast().getValue()); + mlir::cast(array[i]).getValue()); } else if (llvm::isa(attr)) { - flex_builder->String(name.c_str(), - array[i].cast().getValue().str()); + flex_builder->String( + name.c_str(), + mlir::cast(array[i]).getValue().str()); } else if (llvm::isa(array[i])) { - flex_builder->Bool(name.c_str(), - array[i].cast().getValue()); + flex_builder->Bool( + name.c_str(), + mlir::cast(array[i]).getValue()); } else if (llvm::isa(array[i])) { flex_builder->String( name.c_str(), - array[i].cast().getValue().str()); + mlir::cast(array[i]).getValue().str()); } else if (llvm::isa(array[i])) { - flex_builder->Int( - name.c_str(), - array[i].cast().getValue().getSExtValue()); + flex_builder->Int(name.c_str(), + mlir::cast(array[i]) + .getValue() + .getSExtValue()); } else if (llvm::isa(array[i])) { - flex_builder->Float( - name.c_str(), - array[i].cast().getValue().convertToFloat()); + flex_builder->Float(name.c_str(), + mlir::cast(array[i]) + .getValue() + .convertToFloat()); } else if (llvm::isa(array[i])) { CreateFlexbufferVector(flex_builder, name, array[i]); @@ -1835,43 +1839,49 @@ Translator::BuildVhloCompositeV1Op(mlir::vhlo::CompositeOpV1 composite_op, uint32_t opcode_index = GetOpcodeIndex(op_name, tflite::BuiltinOperator_STABLEHLO_COMPOSITE); - int32_t api_version = composite_op.getVersion() - .cast() - .getValue() - .getSExtValue(); + int32_t api_version = + mlir::cast(composite_op.getVersion()) + .getValue() + .getSExtValue(); auto name = builder_.CreateString( - composite_op.getName().cast().getValue().str()); + mlir::cast(composite_op.getName()) + .getValue() + .str()); - auto composite_attributes = composite_op.getCompositeAttributes() - .cast(); + auto composite_attributes = mlir::cast( + composite_op.getCompositeAttributes()); auto flex_builder = std::make_unique(); size_t map_start = flex_builder->StartMap(); for (auto namedAttr : composite_attributes.getValue()) { auto name = - namedAttr.first.cast().getValue().str(); + 
mlir::cast(namedAttr.first).getValue().str(); auto attr = namedAttr.second; if (llvm::isa(attr)) - flex_builder->Bool(name.c_str(), attr.cast().getValue()); + flex_builder->Bool(name.c_str(), + mlir::cast(attr).getValue()); else if (llvm::isa(attr)) flex_builder->String(name.c_str(), - attr.cast().getValue().str()); + mlir::cast(attr).getValue().str()); else if (llvm::isa(attr)) - flex_builder->Bool(name.c_str(), - attr.cast().getValue()); + flex_builder->Bool( + name.c_str(), mlir::cast(attr).getValue()); else if (llvm::isa(attr)) flex_builder->String( - name.c_str(), attr.cast().getValue().str()); - else if (llvm::isa(attr)) - flex_builder->Int( name.c_str(), - attr.cast().getValue().getSExtValue()); + mlir::cast(attr).getValue().str()); + else if (llvm::isa(attr)) + flex_builder->Int(name.c_str(), + mlir::cast(attr) + .getValue() + .getSExtValue()); else if (llvm::isa(attr)) - flex_builder->Float( - name.c_str(), - attr.cast().getValue().convertToFloat()); + flex_builder->Float(name.c_str(), + mlir::cast(attr) + .getValue() + .convertToFloat()); else if (llvm::isa(attr)) CreateFlexbufferVector(flex_builder, name, attr); else if (llvm::isa(attr)) { @@ -1932,8 +1942,8 @@ Translator::BuildVhloCompositeV1Op(mlir::vhlo::CompositeOpV1 composite_op, flex_builder->Finish(); int32_t decomposition_subgraph_index = - subgraph_index_map_[composite_op.getDecomposition() - .cast() + subgraph_index_map_[mlir::cast( + composite_op.getDecomposition()) .getValue() .str()]; @@ -3631,11 +3641,17 @@ std::string Translator::SerializeDebugMetadata(mlir::ModuleOp module) { std::optional>> Translator::CreateMetadataVector() { + constexpr StringRef kRuntimeVersionMetadataKey = "min_runtime_version"; auto dict_attr = module_->getAttrOfType("tfl.metadata"); std::vector> metadata; if (dict_attr) { for (const auto& named_attr : dict_attr) { StringRef name = named_attr.getName(); + if (name == kRuntimeVersionMetadataKey) { + LOG(WARNING) << "Skipping runtime version metadata in the model. This " + "will be generated by the exporter."; + continue; + } mlir::Attribute attr = named_attr.getValue(); if (auto content = mlir::dyn_cast(attr)) { metadata.push_back(BuildMetadata(name, content.getValue())); @@ -3652,8 +3668,8 @@ Translator::CreateMetadataVector() { // 16-byte because it's the alignment of buffers in flatbuffer, so it won't // cause any waste of space if the actual string is shorter than 16 bytes. constexpr std::size_t kByteStringSize = 16; - metadata.push_back( - BuildMetadata("min_runtime_version", std::string(kByteStringSize, '\0'))); + metadata.push_back(BuildMetadata(kRuntimeVersionMetadataKey, + std::string(kByteStringSize, '\0'))); if (use_buffer_offset_) { metadata.push_back( BuildMetadata(tflite_metadata_buffer_location, "outside flatbuffers")); diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 132d87c93cd4..57e1fd26f936 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -81,6 +81,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/flatbuffer_operator.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/offset_buffer.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_traits.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/schema/mutable/debug_metadata_generated.h" #include "tensorflow/compiler/mlir/lite/schema/mutable/schema_generated.h" @@ -91,7 +92,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/utils/control_edges.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/lite/utils/shape_and_size_utils.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" @@ -377,10 +377,8 @@ mlir::Operation* ConvertMinMaxToStatsOp(const TensorT& tensor, OpBuilder b, // min/max stats is just for comments, so ignore it. if (!tensor.quantization || tfl::IsQuantized(tensor)) return nullptr; // If the result isn't float and unquantizable, the min/max is ignored. - if (!res.getType() - .cast() - .getElementType() - .isa()) { + if (!llvm::isa( + llvm::cast(res.getType()).getElementType())) { return nullptr; } auto mins = tensor.quantization->min; @@ -438,7 +436,7 @@ StatusOr BuildExternalConstOp(const tflite::TensorT& tensor, TF_ASSIGN_OR_RETURN(mlir::TensorType type, tfl::GetTensorType(tensor, builder, /*is_constant=*/true)); - auto shaped_type = type.dyn_cast(); + auto shaped_type = llvm::dyn_cast(type); if (!shaped_type) { return errors::Internal("Constant doesn't have a shape"); } @@ -457,7 +455,7 @@ StatusOr BuildVariableOp(const tflite::TensorT& tensor, TF_ASSIGN_OR_RETURN(mlir::TensorType type, tfl::GetTensorType(tensor, builder, /*is_constant=*/true)); - auto shaped_type = type.dyn_cast(); + auto shaped_type = llvm::dyn_cast(type); if (!shaped_type) { return errors::Internal("Constant doesn't have a shape"); } @@ -510,7 +508,7 @@ static StatusOr BuildSparseConstOp( TF_ASSIGN_OR_RETURN(mlir::TensorType type, tfl::GetTensorType(tensor, builder, /*is_constant=*/true)); - auto shaped_type = type.dyn_cast(); + auto shaped_type = llvm::dyn_cast(type); if (!shaped_type) { return errors::Internal("Constant doesn't have a shape"); } @@ -598,7 +596,7 @@ StatusOr BuildConstOp(const tflite::TensorT& tensor, /*is_constant=*/true, /*is_intermediate=*/false, /*get_storage=*/true)); - auto shaped_type = type.dyn_cast(); + auto shaped_type = llvm::dyn_cast(type); if (!shaped_type) { return errors::Internal("Constant doesn't have a shape"); } @@ -619,11 +617,11 @@ StatusOr BuildConstOp(const tflite::TensorT& tensor, } auto elem_type = shaped_type.getElementType(); - if (auto float_type = elem_type.dyn_cast()) { + if (auto float_type = llvm::dyn_cast(elem_type)) { TF_ASSIGN_OR_RETURN(value, tfl::ConvertFloatBuffer(shaped_type, buffer)); - } else if (elem_type.isa()) { + } else if (llvm::isa(elem_type)) { TF_ASSIGN_OR_RETURN(value, tfl::ConvertIntBuffer(shaped_type, buffer)); - } else if (elem_type.isa()) { + } else if (llvm::isa(elem_type)) { tensorflow::TensorProto repr = tfl::ConvertTfliteConstTensor(tensor, buffer); std::vector refs; @@ -633,7 +631,8 @@ StatusOr BuildConstOp(const tflite::TensorT& tensor, refs.push_back({ref.data(), ref.size()}); value = 
mlir::DenseStringElementsAttr::get(shaped_type, refs); - } else if (elem_type.isa()) { + } else if (llvm::isa( + elem_type)) { tensorflow::TensorProto repr = tfl::ConvertTfliteConstTensor(tensor, buffer); std::string mangled = tensorflow::mangling_util::MangleTensor(repr); @@ -889,7 +888,7 @@ StatusOr ConvertOp( op_state.addTypes({type}); } - // While the last several tensors could be optional tensors for an tfl op, the + // While the last several tensors could be optional tensors for a tfl op, the // number of input operands could vary. Gets the min/max number of operands // from tflite op name. // Also, since the above code special-handles the `tfl.reshape` op and add an @@ -929,8 +928,8 @@ StatusOr ConvertOp( // Flattens reshape ops when more than one dimension shape operand is given. mlir::DenseIntElementsAttr shape_attr; if (matchPattern(op_state.operands[1], m_Constant(&shape_attr))) { - auto shape_ty = - op_state.operands[1].getType().dyn_cast(); + auto shape_ty = llvm::dyn_cast( + op_state.operands[1].getType()); if (shape_ty != nullptr && shape_ty.hasRank() && shape_ty.getRank() > 1) { llvm::SmallVector shape; int32_t dim_size = 0; @@ -1117,15 +1116,16 @@ static StatusOr PostProcessFuncOp(FuncOp func) { value.getType()); // Only the 8-bit constants are imported with narrow range. if (!qtype || qtype.getStorageTypeIntegralWidth() != 8 || - !(qtype.isa() || - qtype.isa())) { + !(llvm::isa(qtype) || + llvm::isa(qtype))) { return; } for (auto& use : value.getUses()) { Operation* user = use.getOwner(); if (user->hasTrait()) continue; - auto affine_user = llvm::dyn_cast(user); + auto affine_user = + llvm::dyn_cast(user); if (affine_user && affine_user.GetAffineOperandIndex() == use.getOperandNumber() && affine_user.RequiredNarrowRangeAffineOperand()) @@ -1134,14 +1134,16 @@ static StatusOr PostProcessFuncOp(FuncOp func) { if (full_range_const == value) { mlir::quant::QuantizedType new_qtype; if (auto per_axis = - qtype.dyn_cast()) { + llvm::dyn_cast( + qtype)) { new_qtype = mlir::quant::UniformQuantizedPerAxisType::get( per_axis.getFlags(), per_axis.getStorageType(), per_axis.getExpressedType(), per_axis.getScales(), per_axis.getZeroPoints(), per_axis.getQuantizedDimension(), per_axis.getStorageTypeMin() - 1, per_axis.getStorageTypeMax()); } else if (auto per_tensor = - qtype.dyn_cast()) { + llvm::dyn_cast( + qtype)) { new_qtype = mlir::quant::UniformQuantizedType::get( per_tensor.getFlags(), per_tensor.getStorageType(), per_tensor.getExpressedType(), per_tensor.getScale(), @@ -1185,7 +1187,8 @@ int GetTensorIndex(const std::string& tensor_name, llvm::SmallVector GetStringsFromAttrWithSeparator( mlir::DictionaryAttr attr, const std::string& attr_key) { llvm::SmallVector result; - if (auto str = attr.get(attr_key).dyn_cast_or_null()) { + if (auto str = + llvm::dyn_cast_if_present(attr.get(attr_key))) { str.getValue().split(result, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); } @@ -1643,11 +1646,13 @@ void AddRegionsForTflWhileOp(mlir::ModuleOp module) { mlir::SymbolTable symbol_table(module); module.walk([&](mlir::TFL::WhileOp while_op) { auto cond = symbol_table.lookup( - while_op->getAttr("cond").cast().getValue()); + llvm::cast(while_op->getAttr("cond")) + .getValue()); AddCallOpInWhileOpRegion(while_op.getCond(), cond); while_op->removeAttr("cond"); auto body = symbol_table.lookup( - while_op->getAttr("body").cast().getValue()); + llvm::cast(while_op->getAttr("body")) + .getValue()); AddCallOpInWhileOpRegion(while_op.getBody(), body); while_op->removeAttr("body"); }); @@ -1658,15 
+1663,15 @@ void AddRegionsForStableHLOOp(mlir::ModuleOp module) { std::vector to_delete_funcs; module.walk([&](mlir::vhlo::ReduceOpV1 reduce_op) { auto body = symbol_table.lookup( - reduce_op->getAttr("body").cast().getValue()); + llvm::cast(reduce_op->getAttr("body")) + .getValue()); InlineVhloOpRegion(reduce_op.getBody(), body); reduce_op->removeAttr("body"); to_delete_funcs.push_back(body); }); module.walk([&](mlir::vhlo::ReduceWindowOpV1 reduce_window_op) { auto body = symbol_table.lookup( - reduce_window_op->getAttr("body") - .cast() + llvm::cast(reduce_window_op->getAttr("body")) .getValue()); InlineVhloOpRegion(reduce_window_op.getBody(), body); reduce_window_op->removeAttr("body"); @@ -1674,8 +1679,8 @@ void AddRegionsForStableHLOOp(mlir::ModuleOp module) { }); module.walk([&](mlir::vhlo::ScatterOpV1 scatter_op) { auto update_computation = symbol_table.lookup( - scatter_op->getAttr(kScatterRegionFuncName) - .cast() + llvm::cast( + scatter_op->getAttr(kScatterRegionFuncName)) .getValue()); InlineVhloOpRegion(scatter_op.getUpdateComputation(), update_computation); scatter_op->removeAttr(kScatterRegionFuncName); @@ -1683,8 +1688,7 @@ void AddRegionsForStableHLOOp(mlir::ModuleOp module) { }); module.walk([&](mlir::vhlo::SortOpV1 sort_op) { auto comparator = symbol_table.lookup( - sort_op->getAttr("comparator") - .cast() + llvm::cast(sort_op->getAttr("comparator")) .getValue()); InlineVhloOpRegion(sort_op.getComparator(), comparator); sort_op->removeAttr("comparator"); @@ -1692,11 +1696,13 @@ void AddRegionsForStableHLOOp(mlir::ModuleOp module) { }); module.walk([&](mlir::vhlo::WhileOpV1 while_op) { auto cond = symbol_table.lookup( - while_op->getAttr("cond").cast().getValue()); + llvm::cast(while_op->getAttr("cond")) + .getValue()); InlineVhloOpRegion(while_op.getCond(), cond); while_op->removeAttr("cond"); auto body = symbol_table.lookup( - while_op->getAttr("body").cast().getValue()); + llvm::cast(while_op->getAttr("body")) + .getValue()); InlineVhloOpRegion(while_op.getBody(), body); while_op->removeAttr("body"); to_delete_funcs.push_back(body); diff --git a/tensorflow/compiler/mlir/lite/integrations/BUILD b/tensorflow/compiler/mlir/lite/integrations/BUILD new file mode 100644 index 000000000000..cae74c9c3ac7 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/integrations/BUILD @@ -0,0 +1,72 @@ +# Copyright 2025 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +load("//tensorflow:tensorflow.bzl", "py_test") +load("//tensorflow:tensorflow.default.bzl", "pybind_extension") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + "//tensorflow/compiler/mlir/lite/integrations:__subpackages__", + "//third_party/odml/litert/litert/python/tools/model_utils:__subpackages__", + ], + licenses = ["notice"], +) + +pybind_extension( + name = "model_utils_core_pybind", + srcs = [ + "model_utils_core_pybind.cc", + ], + deps = [ + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/lite:flatbuffer_export", + "//tensorflow/compiler/mlir/lite:flatbuffer_import", + "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/lite/stablehlo:prepare_hlo", + "//tensorflow/compiler/mlir/tensorflow:convert_tensor", + "//tensorflow/python/lib/core:ndarray_tensor", + "//tensorflow/python/lib/core:py_func_lib", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:CAPIIRHeaders", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:FuncExtensions", + "@llvm-project//mlir:FuncTransforms", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:MLIRBindingsPythonHeaders", + "@llvm-project//mlir:MLIRBindingsPythonHeadersAndDeps", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + "@local_xla//third_party/python_runtime:headers", + "@pybind11", + "@stablehlo//:register", + "@stablehlo//:stablehlo_ops", + "@stablehlo//:vhlo_ops", + ], +) + +py_test( + name = "py_bindings_test", + srcs = ["py_bindings_test.py"], + deps = [ + "//tensorflow/compiler/mlir/lite/integrations/python/mlir", + ], +) diff --git a/tensorflow/compiler/mlir/lite/integrations/model_utils_core_pybind.cc b/tensorflow/compiler/mlir/lite/integrations/model_utils_core_pybind.cc new file mode 100644 index 000000000000..42ae13c57e42 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/integrations/model_utils_core_pybind.cc @@ -0,0 +1,223 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include + +#include +#include +#include +#include + +#include "mlir/Support/LLVM.h" +#include "mlir/Tools/mlir-opt/MlirOptMain.h" +#include "llvm/Support/Casting.h" +#include "mlir-c/IR.h" // from @llvm-project +#include "mlir/Bindings/Python/PybindAdaptors.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/CAPI/IR.h" // from @llvm-project +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/Extensions/AllExtensions.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Func/Transforms/Passes.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/InitAllDialects.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "pybind11/cast.h" // from @pybind11 +#include "pybind11/pybind11.h" // from @pybind11 +#include "pybind11/pytypes.h" // from @pybind11 +#include "stablehlo/dialect/Register.h" // from @stablehlo +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "stablehlo/dialect/VhloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" +#include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/python/lib/core/ndarray_tensor.h" + +namespace py = pybind11; + +// ----------------------------------------------------------------------------- +// Module initialization. 
+// ----------------------------------------------------------------------------- + +namespace { + +class MlirPythonPass + : public mlir::PassWrapper> { + public: + explicit MlirPythonPass(std::string name, std::string description, + py::object pyfunc) + : name_(name), description_(description), pyfunc_(pyfunc) { + pyfunc.inc_ref(); + } + + ~MlirPythonPass() override = default; + + mlir::StringRef getName() const override { return name_; } + mlir::StringRef getArgument() const override { return name_; } + mlir::StringRef getDescription() const override { return description_; } + + void runOnOperation() override { + auto module_clone = getOperation().clone(); + MlirModule c_module = wrap(module_clone); + + auto py_module = py::cast(c_module); + auto py_args = py::make_tuple(py_module); + PyObject* py_pass_ret = PyObject_CallObject(pyfunc_.ptr(), py_args.ptr()); + + if (py_pass_ret == nullptr || PyErr_Occurred()) { + PyErr_PrintEx(0); + PyErr_Clear(); + signalPassFailure(); + return; + } + auto py_new_module_op = py::cast(py_pass_ret); + auto c_new_module_op = py::cast(py_new_module_op); + mlir::Operation* new_module_op = unwrap(c_new_module_op); + + // TODO: Copy attributes from new_module + getOperation().getBodyRegion().takeBody(new_module_op->getRegion(0)); + + module_clone.erase(); + } + + private: + std::string name_; + std::string description_; + py::object pyfunc_; +}; + +inline void RegisterDialects(mlir::DialectRegistry& registry) { + mlir::registerAllDialects(registry); + mlir::stablehlo::registerAllDialects(registry); + mlir::func::registerAllExtensions(registry); + registry.insert(); +} + +PYBIND11_MODULE(model_utils_core_pybind, m) { + Py_Initialize(); + + m.doc() = "LiteRT ModelUtils Core Pybinds"; + // Register passes on load. + mlir::registerTransformsPasses(); + mlir::func::registerFuncPasses(); + mlir::odml::registerLegalizeStablehloToVhloPass(); + + m.def("mlir_opt_main", [](std::vector argv, + std::vector pass_names, + std::vector pass_descriptions, + std::vector pass_fns) { + std::vector c_argv_vec; + c_argv_vec.reserve(argv.size()); + for (size_t i = 0; i < argv.size(); ++i) + c_argv_vec.push_back(const_cast(argv[i].c_str())); + + int argc = argv.size(); + char** c_argv = c_argv_vec.data(); + + tensorflow::InitMlir y(&argc, &c_argv); + + mlir::DialectRegistry registry; + RegisterDialects(registry); + + int num_passes = pass_names.size(); + for (int i = 0; i < num_passes; ++i) { + mlir::PassRegistration( + [&, i = i]() -> std::unique_ptr { + std::unique_ptr p = std::make_unique( + pass_names[i], pass_descriptions[i], pass_fns[i]); + return p; + }); + } + + (void)mlir::MlirOptMain(argc, c_argv, "ModelUtils python passes driver\n", + registry); + }); + + m.def("register_dialects", [](MlirContext context) { + mlir::DialectRegistry registry; + RegisterDialects(registry); + unwrap(context)->appendDialectRegistry(registry); + unwrap(context)->loadAllAvailableDialects(); + }); + + m.def("flatbuffer_to_mlir", + [](py::bytes buffer, MlirContext context) -> MlirModule { + mlir::DialectRegistry registry; + RegisterDialects(registry); + unwrap(context)->appendDialectRegistry(registry); + unwrap(context)->loadAllAvailableDialects(); + + auto module_op = tflite::FlatBufferToMlir( + buffer, unwrap(context), mlir::UnknownLoc::get(unwrap(context))); + return wrap(module_op.release()); + }); + + m.def("mlir_to_flatbuffer", [](MlirOperation c_op) { + auto op = unwrap(c_op); + auto module_op = llvm::dyn_cast(op); + + tflite::FlatbufferExportOptions options; + std::string result; + 
tflite::MlirToFlatBufferTranslateFunction(module_op, options, &result, + true); + return py::bytes(result); + }); + + m.def("get_operation_attribute_names", [](MlirOperation c_op) { + mlir::Operation* op = unwrap(c_op); + + std::vector attr_names; + for (auto attr : op->getAttrDictionary()) { + attr_names.push_back(attr.getName().str()); + } + return attr_names; + }); + + m.def("get_dictionary_attr_names", [](MlirAttribute c_attr) { + auto attr = mlir::cast(unwrap(c_attr)); + std::vector attr_names; + for (auto attr : attr) { + attr_names.push_back(attr.getName().str()); + } + return attr_names; + }); + + m.def("get_elements_attr_buffer", [](MlirAttribute c_attr) { + auto attr = mlir::cast(unwrap(c_attr)); + + tensorflow::Tensor tensor; + auto status = tensorflow::ConvertToTensor(attr, &tensor); + PyObject* np_array = Py_None; + status = tensorflow::TensorToNdarray(tensor, &np_array); + + return py::reinterpret_steal(np_array); + }); +} + +} // namespace diff --git a/tensorflow/compiler/mlir/lite/integrations/py_bindings_test.py b/tensorflow/compiler/mlir/lite/integrations/py_bindings_test.py new file mode 100644 index 000000000000..b750a7d92311 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/integrations/py_bindings_test.py @@ -0,0 +1,27 @@ +# Copyright 2025 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests to ensure that mlir py_bindings building properly.""" + +# pylint: disable=g-import-not-at-top +# pylint: disable=unused-import + + +def smoketest(): + import tensorflow.compiler.mlir.lite.integrations.python.mlir + + +if __name__ == "__main__": + smoketest() diff --git a/tensorflow/compiler/mlir/lite/integrations/python/mlir/BUILD b/tensorflow/compiler/mlir/lite/integrations/python/mlir/BUILD new file mode 100644 index 000000000000..2162d9864827 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/integrations/python/mlir/BUILD @@ -0,0 +1,43 @@ +# Copyright 2025 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +load("//tensorflow:py.default.bzl", "py_library") +load("//tensorflow/compiler/mlir/lite:symlink_files.bzl", "symlink_inputs") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = ["//tensorflow/compiler/mlir/lite:__subpackages__"], + features = [ + # Cannot use header_modules (parse_headers feature fails). + "-use_header_modules", + ], + licenses = ["notice"], +) + +symlink_inputs( + name = "mlir_libs", + rule = py_library, + symlinked_inputs = {"srcs": { + "_mlir_libs/": ["@llvm-project//mlir/python:MlirLibsPyFiles"], + }}, +) + +py_library( + name = "mlir", + deps = [ + ":mlir_libs", + "//tensorflow/compiler/mlir/lite/integrations/python/mlir/_mlir_libs:_mlir", + ], +) diff --git a/tensorflow/compiler/mlir/lite/integrations/python/mlir/_mlir_libs/BUILD b/tensorflow/compiler/mlir/lite/integrations/python/mlir/_mlir_libs/BUILD new file mode 100644 index 000000000000..303a2bb48544 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/integrations/python/mlir/_mlir_libs/BUILD @@ -0,0 +1,59 @@ +# Copyright 2025 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +load("//tensorflow:tensorflow.default.bzl", "pybind_extension") +load("//tensorflow/compiler/mlir/lite:symlink_files.bzl", "symlink_files") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = ["//tensorflow/compiler/mlir/lite:__subpackages__"], + features = [ + # Cannot use header_modules (parse_headers feature fails). + "-use_header_modules", + ], + licenses = ["notice"], +) + +# These flags are needed for parse_headers feature. 
+COPTS = [ + "-fexceptions", + "-frtti", +] + +pybind_extension( + name = "_mlir", + srcs = [ + "@llvm-project//mlir:lib/Bindings/Python/MainModule.cpp", + ], + copts = COPTS, + pytype_srcs = [ + ":_mlirPyi", + ], + deps = [ + "@llvm-project//mlir:MLIRBindingsPythonCore", + "@llvm-project//mlir:MLIRBindingsPythonNanobindHeaders", + "@nanobind", + ], +) + +symlink_files( + name = "_mlirPyi", + srcs = [ + "@llvm-project//mlir/python:IRPyIFiles", + "@llvm-project//mlir/python:PassManagerPyIFiles", + ], + dst = "_mlir", + flatten = True, +) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_canonicalize.td b/tensorflow/compiler/mlir/lite/ir/tfl_canonicalize.td index d9200ddc70f1..3881a1e29177 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_canonicalize.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_canonicalize.td @@ -32,7 +32,7 @@ def GetSqueezedPermutation: NativeCodeCall<"GetSqueezedPermutation($0, $1)">; // Check to see if the tensor dimensions can be Squeezed by eliminating 1s' def CanSqueezeTensor : Constraint GetSqueezedShape($0).getNumElements()">>; + "GetShapeAttr($0).getNumElements() > GetSqueezedShape($0).getNumElements()">>; // Pattern to convert TFL_TransposeOp with rank>6 to rank<=6 if there are @@ -50,7 +50,12 @@ def ConvertTransposeToDecreaseRank : Pat< (TFL_TransposeOp (TFL_ReshapeOp $input, (Arith_ConstantOp (GetSqueezedShape $input))), (Arith_ConstantOp (GetSqueezedPermutation $input, $permutation))), - (Arith_ConstantOp (GetShape $output_transpose))), + (Arith_ConstantOp (GetShapeAttr $output_transpose))), [(AnyStaticShapeTensor $input), (HasRankAtLeast<7> $input), (CanSqueezeTensor $input)]>; + +def RemoveNoopTranspose : Pat< + (TFL_TransposeOp $input, $perm), + (replaceWithValue $input), + [(IsTransposeNoop $perm)]>; \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td b/tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td index fa85389789e5..57e4ec22976d 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td @@ -27,9 +27,9 @@ include "tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td" // Referred TF_AnyStrAttrOf in tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td class TFL_AnyStrAttrOf cases> : StringBasedAttr< CPred().getValue() == \"" # !head(cases) # "\"", + "llvm::cast($_self).getValue() == \"" # !head(cases) # "\"", !foreach(case, !tail(cases), - "$_self.cast().getValue() == \"" # case # "\""), + "llvm::cast($_self).getValue() == \"" # case # "\""), prev, cur, prev # " || " # cur)>, "string attribute whose value is " # !foldl(/*init*/!head(cases), /*list*/!tail(cases), diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index d2b23cffe125..4ff0bc9e01d9 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -81,10 +81,10 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/FoldUtils.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_traits.h" #include "tensorflow/compiler/mlir/lite/utils/arithmetic_count_util.h" #include "tensorflow/compiler/mlir/lite/utils/shape_and_size_utils.h" #include "tensorflow/compiler/mlir/lite/utils/utils.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" @@ -96,15 +96,17 @@ limitations under the License. namespace mlir { namespace TFL { +// go/keep-sorted start INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(CeilOp); INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(CosOp); -INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(LocalResponseNormalizationOp); INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(FloorOp); -INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(RoundOp); +INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(LocalResponseNormalizationOp); INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(NegOp); +INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(RoundOp); INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(SinOp); INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(SqrtOp); INFER_RETURN_TYPE_COMPONENTS_FROM_OPERANDS(SquareOp); +// go/keep-sorted end namespace { @@ -193,7 +195,7 @@ DenseElementsAttr GetSqueezedShape(Value value_tensor) { // TFL_TransposeOp when the tensor has some dimensions with value==1 // Example- "tfl.transpose"(tensor<56x8x56x1x1x1x7xf32>, [4, 5, 1, 2, 0, 6, 3]) // Permutation before squeese is [4, 5, 1, 2, 0, 6, 3] becomes [1, 2, 0, 3] -// after squeeze is perfomed to retain the relative ordering of the non-1 dims. +// after squeeze is performed to retain the relative ordering of the non-1 dims. DenseElementsAttr GetSqueezedPermutation(Value input_value, Value input_permutation) { auto input_shape = @@ -258,6 +260,58 @@ bool ShouldFoldOperation(Operation* inst) { (results_size <= kSizeFactor * operands_size)); } +// Returns dimension index for the given axis that supports negative +// indexing. +int64_t NormalizeDim(int64_t axis, int64_t rank) { + return axis >= 0 ? axis : axis + rank; +} + +Type InferReductionOpType(Value input, Value reduction_indices, + BoolAttr keep_dims) { + Type input_ty = input.getType(); + Type element_ty = getElementTypeOrSelf(input_ty); + + // Output type is unranked if input type is not ranked. + auto ranked_ty = mlir::dyn_cast(input_ty); + if (!ranked_ty) return UnrankedTensorType::get(element_ty); + int64_t rank = ranked_ty.getRank(); + + DenseIntElementsAttr indices; + if (!matchPattern(reduction_indices, m_Constant(&indices))) { + // Output type is unranked if reduction indices are not constant and reduced + // dimensions are not kept. + if (!keep_dims.getValue()) return UnrankedTensorType::get(element_ty); + + // Otherwise, output type has same rank as the input. + return RankedTensorType::get( + SmallVector(rank, ShapedType::kDynamic), element_ty); + } + + int64_t num_reduce_dim = 0; + llvm::SmallVector is_reduce_dim(rank, false); + for (const APInt& index : indices.getValues()) { + int64_t dim = NormalizeDim(index.getSExtValue(), rank); + // Invalid input. 
+ assert(dim >= 0 && dim < rank); + + if (!is_reduce_dim[dim]) { + is_reduce_dim[dim] = true; + num_reduce_dim++; + } + } + + ArrayRef shape = ranked_ty.getShape(); + SmallVector out_shape; + out_shape.reserve(rank - (keep_dims.getValue() ? 0 : num_reduce_dim)); + for (int64_t i = 0; i < rank; ++i) { + if (!is_reduce_dim[i]) + out_shape.push_back(shape[i]); + else if (keep_dims.getValue()) + out_shape.push_back(1); + } + return RankedTensorType::get(out_shape, element_ty); +} + #include "tensorflow/compiler/mlir/lite/ir/tfl_canonicalize.inc" } // namespace @@ -425,7 +479,7 @@ bool EqualsZero(Value value) { // Replaces the bias operand with a "none" type value if the bias value is // constant zero. -// `ConcreteOpType` must be an concrete MLIR op class that has an optional +// `ConcreteOpType` must be a concrete MLIR op class that has an optional // bias operand named 'bias'. template struct RemoveOptionalZeroBias : public OpRewritePattern { @@ -1527,7 +1581,7 @@ LogicalResult FullyConnectedOp::verify() { // Input's element size must be multiple of parameter's z_in dimension. const int z_in = filter_type.getDimSize(1); - const int num_input_elements = input_type.getNumElements(); + const int64_t num_input_elements = input_type.getNumElements(); if (z_in != 0 && num_input_elements % z_in != 0) { return op.emitOpError(llvm::formatv( "expect 'input' num_elements % {0} == 0, got input type ", z_in)) @@ -1543,7 +1597,7 @@ LogicalResult FullyConnectedOp::verify() { return mlir::success(); } - const int num_output_elements = output_type.getNumElements(); + const int64_t num_output_elements = output_type.getNumElements(); const int z_out = filter_type.getDimSize(0); if (num_output_elements % z_out != 0) { return op.emitOpError(llvm::formatv( @@ -2232,16 +2286,14 @@ struct RemoveAdjacentReshape : public RewritePattern { explicit RemoveAdjacentReshape(MLIRContext* context) : RewritePattern(ReshapeOp::getOperationName(), 1, context) {} - LogicalResult match(Operation* op) const override { - auto thisOp = cast(op); - auto prevOp = thisOp.getOperand(0).getDefiningOp(); - return isa_and_nonnull(prevOp) ? success() : failure(); - } - - void rewrite(Operation* op, PatternRewriter& rewriter) const override { + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { auto thisOp = cast(op); - auto prevOp = cast(thisOp.getOperand(0).getDefiningOp()); - + auto prevOp = + dyn_cast_or_null(thisOp.getOperand(0).getDefiningOp()); + if (!prevOp) { + return failure(); + } // Replace // %1 = "tfl.reshape"(%0, %shape0) // %2 = "tfl.reshape"(%1, %shape1) @@ -2249,6 +2301,7 @@ struct RemoveAdjacentReshape : public RewritePattern { // %2 = "tfl.reshape"(%0, %shape1) rewriter.replaceOpWithNewOp( op, thisOp.getType(), prevOp.getOperand(0), thisOp.getOperand(1)); + return success(); } }; @@ -2964,7 +3017,8 @@ struct DropFakeQuant : public RewritePattern { explicit DropFakeQuant(MLIRContext* context) : RewritePattern(FakeQuantOp::getOperationName(), 1, context) {} - LogicalResult match(Operation* op) const override { + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { // We only match the op with valid "minmax" attribute. 
if (!HasValidMinMaxAttribute(op)) return failure(); @@ -2974,12 +3028,9 @@ struct DropFakeQuant : public RewritePattern { for (auto* operand : fakeQuantOp.getResult().getUsers()) if (!HasValidMinMaxAttribute(operand)) return failure(); - return success(); - } - - void rewrite(Operation* op, PatternRewriter& rewriter) const override { // Replace the matched FakeQuantOp by its primary operand. rewriter.replaceOp(op, op->getOperand(0)); + return success(); } }; } // end anonymous namespace @@ -4037,6 +4088,12 @@ OpFoldResult SumOp::fold(FoldAdaptor adaptor) { return DenseFPElementsAttr::get(out_type, out_data); } +void SumOp::build(OpBuilder& builder, OperationState& result, Value input, + Value axes, BoolAttr keep_dims) { + Type out_ty = InferReductionOpType(input, axes, keep_dims); + build(builder, result, out_ty, input, axes, keep_dims); +} + //===----------------------------------------------------------------------===// // RankOp //===----------------------------------------------------------------------===// @@ -4443,6 +4500,27 @@ int64_t TransposeConvOp::GetArithmeticCount(Operation* op) { // StridedSliceOp //===----------------------------------------------------------------------===// +bool VerifyStridedSliceOpInputRankConstraints(StridedSliceOp op) { + auto ranked_input_type = + mlir::dyn_cast(op.getInput().getType()); + + // If input is unranked, there is nothing else to be verified. + if (!ranked_input_type) return true; + const int num_input_dims = ranked_input_type.getRank(); + + // The kernel will reshape the input tensor with new axis, it only supports + // this reshaped tensor up to 5D. + const uint32_t ellipsis_mask = op.getEllipsisMask(); + const uint32_t new_axis_mask = op.getNewAxisMask(); + int num_added_axis = 0; + for (int i = 0; i < 8; ++i) { + if (!((1 << i) & ellipsis_mask) && ((1 << i) & new_axis_mask)) { + num_added_axis++; + } + } + return (num_input_dims + num_added_axis <= 5); +} + LogicalResult StridedSliceOp::verify() { StridedSliceOp op = *this; auto ranked_input_type = @@ -4469,17 +4547,6 @@ LogicalResult StridedSliceOp::verify() { if (strides_type.getDimSize(0) > num_input_dims) return failure(); } - // The kernel will reshape the input tensor with new axis, it only supports - // this reshaped tensor up to 5D. - uint32_t ellipsis_mask = op.getEllipsisMask(); - uint32_t new_axis_mask = op.getNewAxisMask(); - int num_added_axis = 0; - for (int i = 0; i < 8; ++i) { - if (!((1 << i) & ellipsis_mask) && ((1 << i) & new_axis_mask)) { - num_added_axis++; - } - } - if (num_input_dims + num_added_axis > 5) return failure(); return success(); } @@ -4574,7 +4641,7 @@ void ComputePermutation(ArrayRef perms, ArrayRef output_shape, void TransposeOp::getCanonicalizationPatterns(RewritePatternSet& results, MLIRContext* context) { - results.add(context); + results.add(context); } OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) { diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index 5946ce0f31da..89d1a5ed9602 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -35,10 +35,10 @@ limitations under the License. 
#include "mlir/Support/TypeID.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_dialect.h.inc" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_enums.h.inc" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/lite/utils/utils.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #define GET_ATTRDEF_CLASSES #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_attrdefs.h.inc" diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 722abc63f1cb..09bc5776873c 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -28,27 +28,27 @@ include "mlir/Interfaces/LoopLikeInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td" include "tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td" -include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization.td" +include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td" //===----------------------------------------------------------------------===// // TFLite dialect string type - uses the TF string type as implementation //===----------------------------------------------------------------------===// -def TFL_Str : Type()">, +def TFL_Str : Type($_self)">, "TFLite string type">, BuildableType<"getType()">; //===----------------------------------------------------------------------===// // TFLite dialect quint8 type - uses the TF quint8 type as implementation //===----------------------------------------------------------------------===// -def TFL_Quint8 : Type()">, +def TFL_Quint8 : Type($_self)">, "TFLite quint8 type">, BuildableType<"getType()">; //===----------------------------------------------------------------------===// // Type that represents control dependencies //===----------------------------------------------------------------------===// -def TFL_Control: Type()">, "control">, +def TFL_Control: Type($_self)">, "control">, BuildableType<"$_builder.getType()">; @@ -77,7 +77,7 @@ class TFL_OperandsHaveSameShapesOrBroadcastableShape< TFL_RuntimePredOpTrait<"operands do not have the same shape or " "broadcastable shapes within the rank " # max_bcast_rank, CPred<"TFL::VerifyOperandsHaveSameShapesOrBroadcastableShape(" - "$_op, llvm::ArrayRef({" # !interleave(indices, ", ") # + "&$_op, llvm::ArrayRef({" # !interleave(indices, ", ") # "}), " # max_bcast_rank # ")">>; // These additional types/type constraints here are used to decouple the ops @@ -151,10 +151,10 @@ def TFL_StatefulTensor : TypeAlias; // Returns true of operand is none type. class TFL_OperandIsNoneType : - CPred<"$_op.getOperand(" # i # ").getType().isa()">; + CPred<"llvm::isa($_op.getOperand(" # i # ").getType())">; class TFL_OperandIsUnrankedPred : - CPred<"$_op.getOperand(" # n # ").getType().isa()">; + CPred<"llvm::isa($_op.getOperand(" # n # ").getType())">; // TODO: Some of these could be generalized and/or moved to more general // location. 
@@ -162,52 +162,52 @@ class TFL_OperandIsUnrankedPred : class TFL_OperandHasRank : PredOpTrait<"operand " # n # " is " # m # "-D", Or<[TFL_OperandIsUnrankedPred, - CPred<"$_op.getOperand(" # n # - ").getType().cast().getRank() == " # m>]>>; + CPred<"llvm::cast($_op.getOperand(" # n # + ").getType()).getRank() == " # m>]>>; // Returns true if the n-th operand is ranked and has rank dim. class TFL_OperandHasKnownRank : And<[ - CPred<"$_op.getOperand(" # n # ").getType().isa()">, - CPred<"$_op.getOperand(" # n # ").getType().cast().getRank() == " + CPred<"llvm::isa($_op.getOperand(" # n # ").getType())">, + CPred<"llvm::cast($_op.getOperand(" # n # ").getType()).getRank() == " # dim>]>; // True if operand n is ranked and has a rank > dim. class TFL_OperandIsRankedAndHasDimPred : And<[ - CPred<"$_op.getOperand(" # n # ").getType().isa()">, - CPred<"$_op.getOperand(" # n # ").getType().cast().getRank() > " + CPred<"llvm::isa($_op.getOperand(" # n # ").getType())">, + CPred<"llvm::cast($_op.getOperand(" # n # ").getType()).getRank() > " # dim>]>; // Returns true if the n-th operand is ranked and has a dimension length = size // at the rank dim. class TFL_OperandDimEquals : And<[ TFL_OperandIsRankedAndHasDimPred, - CPred<"$_op.getOperand(" # n # ").getType().cast()" + CPred<"llvm::cast($_op.getOperand(" # n # ").getType())" ".getShape()[" # dim # " ] == " # size>]>; // Returns true if the n-th operand is ranked and has a dimension length <= // size at the rank dim. class TFL_OperandDimIsAtMost : And<[ TFL_OperandIsRankedAndHasDimPred, - CPred<"$_op.getOperand(" # n # ").getType().cast()" + CPred<"llvm::cast($_op.getOperand(" # n # ").getType())" ".getShape()[" # dim # " ] <= " # size>]>; // Returns true if the n-th operand has unknown rank or at least rank m. 
class TFL_OperandHasAtleastRank : PredOpTrait<"operand " # n # " is " # m # "-D", - Or<[CPred<"$_op.getOperand(" # n # ").getType().isa()">, - CPred<"$_op.getOperand(" # n # - ").getType().cast().getRank() >= " # m>]>>; + Or<[CPred<"llvm::isa($_op.getOperand(" # n # ").getType())">, + CPred<"llvm::cast($_op.getOperand(" # n # + ").getType()).getRank() >= " # m>]>>; class TFL_OperandRankEquals1DimOfOperand : PredOpTrait<"operand " # x # "'s rank equals operand " # y # "'s size", Or<[TFL_OperandIsUnrankedPred, TFL_OperandIsUnrankedPred, - CPred<"!$_op.getOperand(" # y # - ").getType().cast().hasStaticShape()">, - CPred<"$_op.getOperand(" # x # - ").getType().cast().getRank() == " - "$_op.getOperand(" # y # - ").getType().cast().getShape()[0]">]>>; + CPred<"!llvm::cast($_op.getOperand(" # y # + ").getType()).hasStaticShape()">, + CPred<"llvm::cast($_op.getOperand(" # x # + ").getType()).getRank() == " + "llvm::cast($_op.getOperand(" # y # + ").getType()).getShape()[0]">]>>; class TFL_Operand0DOr1ElementTensor : PredOpTrait<"operand #" # x # " is an 0-d tensor or 1-d tensor w/ 1 element", @@ -219,14 +219,14 @@ class TFL_Operand0DOr1ElementTensor : class TFL_OperandsHaveSameDims : Or<[TFL_OperandIsUnrankedPred, TFL_OperandIsUnrankedPred, - CPred<"!$_op.getOperand(" # x # - ").getType().cast().hasStaticShape()">, - CPred<"!$_op.getOperand(" # y # - ").getType().cast().hasStaticShape()">, - CPred<"$_op.getOperand(" # x # - ").getType().cast().getShape()[" # i # "] == " - "$_op.getOperand(" # y # - ").getType().cast().getShape()[" # j # "]">]>; + CPred<"!llvm::cast($_op.getOperand(" # x # + ").getType()).hasStaticShape()">, + CPred<"!llvm::cast($_op.getOperand(" # y # + ").getType()).hasStaticShape()">, + CPred<"llvm::cast($_op.getOperand(" # x # + ").getType()).getShape()[" # i # "] == " + "llvm::cast($_op.getOperand(" # y # + ").getType()).getShape()[" # j # "]">]>; class TFL_OperandsHaveSameDimsTrait : PredOpTrait<"dim " # i # " of operand " # x # " equals to dim " # j # @@ -238,14 +238,14 @@ class TFL_OperandsHaveSameDimsTrait : class TFL_NumElementsEqualsDim : Or<[TFL_OperandIsUnrankedPred, TFL_OperandIsUnrankedPred, - CPred<"!$_op.getOperand(" # x # - ").getType().cast().hasStaticShape()">, - CPred<"!$_op.getOperand(" # y # - ").getType().cast().hasStaticShape()">, - CPred<"$_op.getOperand(" # x # - ").getType().cast().getNumElements() == " - "$_op.getOperand(" # y # - ").getType().cast().getShape()[" # j # "]">]>; + CPred<"!llvm::cast($_op.getOperand(" # x # + ").getType()).hasStaticShape()">, + CPred<"!llvm::cast($_op.getOperand(" # y # + ").getType()).hasStaticShape()">, + CPred<"llvm::cast($_op.getOperand(" # x # + ").getType()).getNumElements() == " + "llvm::cast($_op.getOperand(" # y # + ").getType()).getShape()[" # j # "]">]>; class TFL_NumElementsEqualsDimTrait : PredOpTrait<"operand " # x # " has num of elements equals to dim " # j # @@ -255,10 +255,10 @@ class TFL_NumElementsEqualsDimTrait : // Return true if number of elements of x-th operand equals to n. 
class TFL_NumElements : Or<[TFL_OperandIsUnrankedPred, - CPred<"!$_op.getOperand(" # x # - ").getType().cast().hasStaticShape()">, - CPred<"$_op.getOperand(" # x # - ").getType().cast().getNumElements() == " # n>]>; + CPred<"!llvm::cast($_op.getOperand(" # x # + ").getType()).hasStaticShape()">, + CPred<"llvm::cast($_op.getOperand(" # x # + ").getType()).getNumElements() == " # n>]>; class TFL_NumElementsTrait : PredOpTrait<"operand " # x # " has num of elements equals to " # n, @@ -268,16 +268,16 @@ class TFL_NumElementsTrait : // when used as element types. class TFL_TFTypesWithSameBits : And<[ - Or<[CPred<"getElementTypeOrSelf($_op.getResult(" # i # ")).isa()">, + Or<[CPred<"llvm::isa(getElementTypeOrSelf($_op.getResult(" # i # ")))">, CPred<"getElementTypeOrSelf($_op.getResult(" # i # ")).isUnsignedInteger(" # num # ")">]>, - Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isa()">, + Or<[CPred<"llvm::isa(getElementTypeOrSelf($_op.getOperand(" # j # ")))">, CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isUnsignedInteger(" # num # ")">]>]>; class TFL_TFOperandTypesWithSameBits : And<[ - Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # i # ")).isa()">, + Or<[CPred<"llvm::isa(getElementTypeOrSelf($_op.getOperand(" # i # ")))">, CPred<"getElementTypeOrSelf($_op.getOperand(" # i # ")).isUnsignedInteger(" # num # ")">]>, - Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isa()">, + Or<[CPred<"llvm::isa(getElementTypeOrSelf($_op.getOperand(" # j # ")))">, CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isUnsignedInteger(" # num # ")">]>]>; class TFL_OperandIsNoneOrHasRank : @@ -285,21 +285,21 @@ class TFL_OperandIsNoneOrHasRank : Or<[ TFL_OperandIsNoneType, TFL_OperandIsUnrankedPred, - CPred<"$_op.getOperand(" # n # - ").getType().cast().getRank() == " # m>]>>; + CPred<"llvm::cast($_op.getOperand(" # n # + ").getType()).getRank() == " # m>]>>; class TFL_OperandIsNoneOrHasRankAtMost : PredOpTrait<"operand " # n # " is at most " # m # "-D", Or<[ TFL_OperandIsNoneType, TFL_OperandIsUnrankedPred, - CPred<"$_op.getOperand(" # n # - ").getType().cast().getRank() <= " # m>]>>; + CPred<"llvm::cast($_op.getOperand(" # n # + ").getType()).getRank() <= " # m>]>>; class TFL_OperandHasRankAtMostPred : Or<[TFL_OperandIsUnrankedPred, - CPred<"$_op.getOperand(" # n # - ").getType().cast().getRank() <= " # m>]>; + CPred<"llvm::cast($_op.getOperand(" # n # + ").getType()).getRank() <= " # m>]>; class TFL_OperandHasRankAtMost : PredOpTrait<"operand " # n # " is at most " # m # "-D", @@ -310,54 +310,54 @@ class TFL_OperandHasRankAtMost : class TFL_TransposeOperandHasEffectiveRankAtMost : PredOpTrait<"operand " # n # " is at most " # m # "-D", Or<[TFL_OperandIsUnrankedPred, - CPred<"GetSqueezedShape($_op.getOperand(" # n # - ")).cast().size() <= " # m>]>>; + CPred<"llvm::cast(GetSqueezedShape($_op.getOperand(" # n # + "))).size() <= " # m>]>>; class TFL_OperandHasRankAtLeast : PredOpTrait<"operand " # n # " is at least " # m # "-D", Or<[TFL_OperandIsUnrankedPred, - CPred<"$_op.getOperand(" # n # - ").getType().cast().getRank() >= " # m>]>>; + CPred<"llvm::cast($_op.getOperand(" # n # + ").getType()).getRank() >= " # m>]>>; class TFL_OperandHasRankRange : PredOpTrait<"operand " # n # " has rank range [" # x # ", " # y # "]", Or<[TFL_OperandIsUnrankedPred, - CPred<"$_op.getOperand(" # n # ").getType().cast().getRank() " - ">= " # x # " && $_op.getOperand(" # n # ").getType().cast()." 
+ CPred<"llvm::cast($_op.getOperand(" # n # ").getType()).getRank() " + ">= " # x # " && llvm::cast($_op.getOperand(" # n # ").getType())." "getRank() <= " # y>]>>; def TFL_FloatNonNegative : AttrConstraint< - CPred<"$_self.isa() && " - "!$_self.cast().getValue().isNegative()">, + CPred<"llvm::isa($_self) && " + "!llvm::cast($_self).getValue().isNegative()">, "whose value is non-negative">; def TFL_BoolTrue : AttrConstraint< - CPred<"$_self.isa() && $_self.cast().getValue()">, + CPred<"llvm::isa($_self) && llvm::cast($_self).getValue()">, "whose value is true">; def TFL_BoolFalse : AttrConstraint< - CPred<"$_self.isa() && !$_self.cast().getValue()">, + CPred<"llvm::isa($_self) && !llvm::cast($_self).getValue()">, "whose value is false">; class TFL_StringEqualsTo : AttrConstraint< - CPred<"$_self.cast().getValue() == \"" # value # "\"">, + CPred<"llvm::cast($_self).getValue() == \"" # value # "\"">, "whose value equals to '" # value # "'">; // Ensures the array attribute's size is within the given maximum size. class TFL_ArrayMaxCount : AttrConstraint< - CPred<"$_self.isa() && $_self.cast().size() <= " # n>, + CPred<"llvm::isa($_self) && llvm::cast($_self).size() <= " # n>, "whose size is at most " # n>; // Ensures the given integer attribute has the given value. class TFL_IntEqualsTo : AttrConstraint< - CPred<"$_self.isa() && " - "$_self.cast().getInt() == " # n>, + CPred<"llvm::isa($_self) && " + "llvm::cast($_self).getInt() == " # n>, "whose value is " # n>; // Ensures the given LSTMKernelType attribute has the given value. class TFL_LSTMKernelTypeEqualsTo : AttrConstraint< - CPred<"$_self.isa() && " - "$_self.cast().getValue() == " # value>, + CPred<"llvm::isa($_self) && " + "llvm::cast($_self).getValue() == " # value>, "whose value is " # value>; // This is a quantization-aware version of TCresVTEtIsSameAsOp @@ -525,6 +525,16 @@ an output element, this operation computes \\(y = |x|\\). let results = (outs TFL_TensorOf<[I16, I32, F32, QI8, QI16]>:$y); let hasFolder = 1; + + // This builder doesn't work with quantized type, so it can only be used by + // non-quantization tablegen patterns. + let builders = [ + OpBuilder<(ins "Value":$input), + [{ + $_state.addOperands({input}); + $_state.addTypes(input.getType()); + }]> + ]; } def TFL_DilateOp : TFL_Op<"dilate", [ @@ -759,11 +769,12 @@ def TFL_ArgMaxOp : TFL_Op<"arg_max", [ let hasOptions = 1; DerivedTFLiteTypeAttr output_type = DerivedTFLiteTypeAttr<[{ - return getResult().getType().cast().getElementType(). - cast().getWidth() > 32 ? tflite::TensorType_INT64 : + return llvm::cast(llvm::cast( + getResult().getType()).getElementType()).getWidth() > 32 ? + tflite::TensorType_INT64 : tflite::TensorType_INT32; }], [{ - TypeAttr::get(getResult().getType().cast().getElementType()) + TypeAttr::get(llvm::cast(getResult().getType()).getElementType()) }]>; } @@ -791,11 +802,12 @@ def TFL_ArgMinOp : TFL_Op<"arg_min", [ let hasOptions = 1; DerivedTFLiteTypeAttr output_type = DerivedTFLiteTypeAttr<[{ - return getResult().getType().cast().getElementType(). - cast().getWidth() > 32 ? tflite::TensorType_INT64 : + return llvm::cast(llvm::cast( + getResult().getType()).getElementType()).getWidth() > 32 ? 
+ tflite::TensorType_INT64 : tflite::TensorType_INT32; }], [{ - TypeAttr::get(getResult().getType().cast().getElementType()) + TypeAttr::get(llvm::cast(getResult().getType()).getElementType()) }]>; } @@ -1114,7 +1126,7 @@ def TFL_BatchMatMulOp : TFL_Op<"batch_matmul", [ TFL_OperandHasAtleastRank<0, 2>, TFL_OperandHasAtleastRank<1, 2>, QuantizableResult, - PredOpTrait<"x and output must have same element type or they are int8 and int32", + TFL_RuntimePredOpTrait<"x and output must have same element type or they are int8 and int32", Or<[TFL_TCresVTEtIsSameAsOp<0, 0>, And<[CPred<"getElementTypeOrSelf($_op.getOperand(0)).isInteger(8)">, CPred<"getElementTypeOrSelf($_op.getOperand(1)).isInteger(8)">, @@ -1637,6 +1649,14 @@ def TFL_EluOp: TFL_Op<"elu", [ let results = (outs TFL_TensorOf<[F32, I8]>:$y); let hasOptions = 0; + + let builders = [ + OpBuilder<(ins "Value":$input), + [{ + $_state.addOperands({input}); + $_state.addTypes(input.getType()); + }]> + ]; } def TFL_EmbeddingLookupOp: TFL_Op<"embedding_lookup", @@ -1973,6 +1993,16 @@ def TFL_HardSwishOp: TFL_Op<"hard_swish", [ let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$output); let hasOptions = 0; + + // This builder doesn't work with quantized type, so it can only be used by + // non-quantization tablegen patterns. + let builders = [ + OpBuilder<(ins "Value":$input), + [{ + $_state.addOperands({input}); + $_state.addTypes(input.getType()); + }]> + ]; } def TFL_L2NormalizationOp : TFL_Op<"l2_normalization", [Pure, @@ -2004,7 +2034,7 @@ def TFL_L2NormalizationOp : TFL_Op<"l2_normalization", [Pure, // central_value = min_value / 2 + (max_value - 1) / 2 + 1 // zero_point = central_value // scale = 1. / (central_value - min_value) - return quant::GetFixedOutputRange(is_signed, bit_width, result_type, + return mlir::TFL::GetFixedOutputRange(is_signed, bit_width, result_type, /*scale=*/1.0 / (1<<(bit_width-1)), /*zero_point=*/0); } }]; @@ -2097,7 +2127,8 @@ def TFL_LogicalAndOp : TFL_Op<"logical_and", [ResultsBroadcastableShape, Pure]> def TFL_LogicalNotOp : TFL_Op<"logical_not", [ Pure, - SameOperandsAndResultShape]> { + SameOperandsAndResultType + ]> { let summary = "Logical NOT operator"; let description = [{ @@ -2163,7 +2194,7 @@ def TFL_LogisticOp: TFL_Op<"logistic", [ auto result_type = getY().getType(); // zero_point = 0 // scale = 1. / (max_value + 1) - return quant::GetFixedOutputRange(is_signed, bit_width, result_type, + return mlir::TFL::GetFixedOutputRange(is_signed, bit_width, result_type, /*scale=*/1.0 / (1<<(bit_width)), /*zero_point=*/-(1<<(bit_width-1))); } @@ -2203,6 +2234,16 @@ def TFL_LogOp: TFL_Op<"log", [ return TF::ArraysAreCastCompatible(l, r); } }]; + + // This builder doesn't work with quantized type, so it can only be used by + // non-quantization tablegen patterns. 
+ let builders = [ + OpBuilder<(ins "Value":$input), + [{ + $_state.addOperands({input}); + $_state.addTypes(input.getType()); + }]> + ]; } def TFL_LogSoftmaxOp : TFL_Op<"log_softmax", [ @@ -2234,7 +2275,7 @@ def TFL_LogSoftmaxOp : TFL_Op<"log_softmax", [ auto result_type = getOutput().getType(); // zero_point = max_value // scale = -log_softmax_output_min / (max_value + 1) - return quant::GetFixedOutputRange(is_signed, bit_width, result_type, + return mlir::TFL::GetFixedOutputRange(is_signed, bit_width, result_type, /*scale=*/16.0 / 256, /*zero_point=*/127); } }]; @@ -2391,7 +2432,8 @@ def TFL_SliceOp : TFL_Op<"slice", [ TFL_TCresVTEtIsSameAsOp<0, 0>>, Pure, SameOperandsAndResultsScale, - TFL_OperandHasRankAtMost<0, 5>, + TFL_RuntimePredOpTrait<"input must have rank at most 5", + TFL_OperandHasRankAtMostPred<0, 5>>, TFL_OperandHasRankAtMost<1, 1>, TFL_OperandHasRankAtMost<2, 1>]> { let summary = "Return a slice from 'input'."; @@ -2454,6 +2496,11 @@ def TFL_SumOp: TFL_Op<"sum", [ let hasFolder = 1; + let builders = [ + OpBuilder<(ins "Value":$input, "Value":$axes, + "BoolAttr":$keep_dims)> + ]; + // TODO(b/215655380): Re-enable this once there is 16-bit MLIR quantizer. // //let extraClassDeclaration = [{ @@ -2976,6 +3023,16 @@ def TFL_Relu0To1Op: TFL_Op<"relu_0_to_1", [ let arguments = (ins TFL_TensorOf<[F32, QUI8, QI8]>:$x); let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$y); + + // This builder doesn't work with quantized type, so it can only be used by + // non-quantization tablegen patterns. + let builders = [ + OpBuilder<(ins "Value":$input), + [{ + $_state.addOperands({input}); + $_state.addTypes(input.getType()); + }]> + ]; } def TFL_Relu1Op: TFL_Op<"relu_n1_to_1", [ @@ -3086,6 +3143,16 @@ def TFL_RsqrtOp: TFL_Op<"rsqrt", [Pure, let results = (outs TFL_TensorOf<[F32, QI8, QI16]>:$y); let hasFolder = 1; + + // This builder doesn't work with quantized type, so it can only be used by + // non-quantization tablegen patterns. + let builders = [ + OpBuilder<(ins "Value":$input), + [{ + $_state.addOperands({input}); + $_state.addTypes(input.getType()); + }]> + ]; } def TFL_ShapeOp: TFL_Op<"shape", [ @@ -3102,7 +3169,7 @@ def TFL_ShapeOp: TFL_Op<"shape", [ let results = (outs TFL_TensorOf<[I32, I64]>:$output); DerivedTypeAttr out_type = DerivedTypeAttr<[{ - return getResult().getType().cast().getElementType(); + return llvm::cast(getResult().getType()).getElementType(); }]>; let hasOptions = 1; @@ -3306,7 +3373,7 @@ def TFL_SoftmaxOp : TFL_Op<"softmax", [ auto result_type = getOutput().getType(); // zero_point = 0 // scale = 1. / (max_value + 1) - return quant::GetFixedOutputRange(is_signed, bit_width, result_type, + return mlir::TFL::GetFixedOutputRange(is_signed, bit_width, result_type, /*scale=*/1.0 / (bit_width == 8 ? (1<<(bit_width)) : (1<<(bit_width-1))), /*zero_point=*/bit_width == 8 ? -(1<<(bit_width-1)): 0); } @@ -3470,7 +3537,7 @@ def TFL_TanhOp: TFL_Op<"tanh", [ // central_value = min_value / 2 + (max_value - 1) / 2 + 1 // zero_point = central_value // scale = 1. 
/ (central_value - min_value) - return quant::GetFixedOutputRange(is_signed, bit_width, result_type, + return mlir::TFL::GetFixedOutputRange(is_signed, bit_width, result_type, /*scale=*/1.0 / (1<<(bit_width-1)), /*zero_point=*/0); } }]; @@ -3623,10 +3690,9 @@ def TFL_UnpackOp : TFL_Op<"unpack", [ } def TFL_ZerosLikeOp: TFL_Op<"zeros_like", [ - PredOpTrait<"input and output must have same element type", - TFL_TCresVTEtIsSameAsOp<0, 0>>, - SameOperandsAndResultShape, - Pure]> { + Pure, + SameOperandsAndResultType + ]> { let summary = "ZerosLike operator"; let description = [{ @@ -3876,9 +3942,9 @@ def TFL_SparseToDenseOp : TFL_Op<"sparse_to_dense", [ TFL_OperandHasRankAtMost<2, 1>, PredOpTrait<"the first operand should have a rank <= 2, when its rank is 2 and has static shape, the second dim should be <= 4", Or<[TFL_OperandIsUnrankedPred<0>, - CPred<"$_op.getOperand(0).getType().cast().getRank() <= 1">, - CPred<"$_op.getOperand(0).getType().cast().getRank() == 2 && !$_op.getOperand(0).getType().cast().hasStaticShape()">, - CPred<"$_op.getOperand(0).getType().cast().getRank() == 2 && $_op.getOperand(0).getType().cast().getShape()[1] <= 4">]>>]> { + CPred<"llvm::cast($_op.getOperand(0).getType()).getRank() <= 1">, + CPred<"llvm::cast($_op.getOperand(0).getType()).getRank() == 2 && !llvm::cast($_op.getOperand(0).getType()).hasStaticShape()">, + CPred<"llvm::cast($_op.getOperand(0).getType()).getRank() == 2 && llvm::cast($_op.getOperand(0).getType()).getShape()[1] <= 4">]>>]> { let summary = "Converts a sparse representation into a dense tensor."; let description = [{ @@ -3921,7 +3987,8 @@ def TFL_StridedSliceOp: TFL_Op<"strided_slice", [ PredOpTrait<"input and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, SameOperandsAndResultsScale, - TFL_OperandHasRankAtMost<0, 5>, + TFL_RuntimePredOpTrait<"input (with new_axis) must have rank at most 5", + CPred<"TFL::VerifyStridedSliceOpInputRankConstraints(llvm::cast($_op))">>, TFL_OperandHasRank<1, 1>, TFL_OperandHasRank<2, 1>, TFL_OperandHasRank<3, 1> @@ -4049,11 +4116,12 @@ value of `input` in the unique output `output`. In other words: ); DerivedTFLiteTypeAttr idx_out_type = DerivedTFLiteTypeAttr<[{ - return getResult(1).getType().cast().getElementType(). - cast().getWidth() > 32 ? tflite::TensorType_INT64 : + return llvm::cast(llvm::cast( + getResult(1).getType()).getElementType()).getWidth() > 32 ? + tflite::TensorType_INT64 : tflite::TensorType_INT32; }], [{ - TypeAttr::get(getResult(1).getType().cast().getElementType()) + TypeAttr::get(llvm::cast(getResult(1).getType()).getElementType()) }]>; let hasOptions = 1; @@ -4095,13 +4163,13 @@ def TFL_DynamicUpdateSliceOp: TFL_Op<"dynamic_update_slice", [ }]; let arguments = (ins - TFL_TensorOf<[I1, I8, I32, I64, F32, F16]>:$operand, - TFL_TensorOf<[I1, I8, I32, I64, F32, F16]>:$update, + TFL_TensorOf<[I1, I8, I16, I32, I64, F32, F16]>:$operand, + TFL_TensorOf<[I1, I8, I16, I32, I64, F32, F16]>:$update, TFL_I32OrI64Tensor:$start_indices ); let results = ( - outs TFL_TensorOf<[I1, I8, I32, I64, F32, F16]>:$output); + outs TFL_TensorOf<[I1, I8, I16, I32, I64, F32, F16]>:$output); let hasFolder = 1; } @@ -4183,6 +4251,19 @@ def TFL_DequantizeOp: TFL_Op<"dequantize", [NoMemoryEffect]> { let arguments = (ins TFL_TensorOf<[QI4, QI8, QUI8, QI16, F16]>:$input); let results = (outs TFL_FpTensor:$output); + + let builders = [ + OpBuilder<(ins "Value":$input), + [{ + $_state.addOperands({input}); + $_state.addTypes(mlir::cast(input.getType()).hasRank() ? 
+ static_cast(RankedTensorType::get( + mlir::cast(input.getType()).getShape(), + $_builder.getF32Type())) : + static_cast( + UnrankedTensorType::get($_builder.getF32Type()))); + }]> + ]; } def TFL_FakeQuantOp : TFL_Op<"fake_quant", [ @@ -5167,8 +5248,8 @@ def TFL_UnsortedSegmentSumOp: TFL_Op<"unsorted_segment_sum", [ def TFL_Atan2Op: TFL_Op<"atan2", [ Pure, - SameOperandsAndResultShape, - SameOperandsAndResultElementType]> { + SameOperandsAndResultType + ]> { let summary = "Atan2 operation"; let description = [{ @@ -5188,8 +5269,7 @@ def TFL_Atan2Op: TFL_Op<"atan2", [ def TFL_SignOp: TFL_Op<"sign", [ Pure, - SameOperandsAndResultShape, - SameOperandsAndResultElementType + SameOperandsAndResultType ]> { let summary = "Sign operation"; @@ -5658,6 +5738,14 @@ value is computed as \\( \sqrt{a^2 + b^2}\\). let results = (outs TFL_TensorOf<[F32, F64]>:$output ); + + let builders = [ + OpBuilder<(ins "Value":$input), + [{ + $_state.addOperands({input}); + $_state.addTypes(dyn_cast(input.getType()).getElementType()); + }]> + ]; } def TFL_RealOp : TFL_Op<"real", [ @@ -5679,6 +5767,14 @@ type `float` that is the real part of each element in `input`. All elements in let results = (outs TFL_TensorOf<[F32, F64]>:$output ); + + let builders = [ + OpBuilder<(ins "Value":$input), + [{ + $_state.addOperands({input}); + $_state.addTypes(dyn_cast(input.getType()).getElementType()); + }]> + ]; } def TFL_ImagOp : TFL_Op<"imag", [ @@ -5700,6 +5796,14 @@ is the real part and *b* is the imaginary part returned by this operation. let results = (outs TFL_TensorOf<[F32, F64]>:$output ); + + let builders = [ + OpBuilder<(ins "Value":$input), + [{ + $_state.addOperands({input}); + $_state.addTypes(dyn_cast(input.getType()).getElementType()); + }]> + ]; } def TFL_HashtableOp: TFL_Op<"hashtable", []> { diff --git a/tensorflow/compiler/mlir/lite/kernels/internal/BUILD b/tensorflow/compiler/mlir/lite/kernels/internal/BUILD index 74910218b1d1..ca2f9ed03181 100644 --- a/tensorflow/compiler/mlir/lite/kernels/internal/BUILD +++ b/tensorflow/compiler/mlir/lite/kernels/internal/BUILD @@ -66,88 +66,20 @@ cc_library( ) config_setting( - name = "haswell", - values = { - "cpu": "haswell", - }, -) - -config_setting( - name = "ios_x86_64", - values = { - "cpu": "ios_x86_64", - }, -) - -config_setting( - name = "tvos_x86_64", - values = { - "cpu": "tvos_x86_64", - }, -) - -config_setting( - name = "k8", - values = { - "cpu": "k8", - }, -) - -config_setting( - name = "x86", - values = { - "cpu": "x86", - }, + name = "x86_32", + constraint_values = ["@platforms//cpu:x86_32"], ) config_setting( name = "x86_64", - values = { - "cpu": "x86_64", - }, -) - -config_setting( - name = "darwin", - values = { - "cpu": "darwin", - }, -) - -config_setting( - name = "darwin_x86_64", - values = { - "cpu": "darwin_x86_64", - }, -) - -config_setting( - name = "freebsd", - values = { - "cpu": "freebsd", - }, -) - -config_setting( - name = "windows", - values = { - "cpu": "x64_windows", - }, + constraint_values = ["@platforms//cpu:x86_64"], ) selects.config_setting_group( name = "x86_any", match_any = [ - ":haswell", - ":ios_x86_64", - ":k8", - ":x86", + ":x86_32", ":x86_64", - ":darwin", - ":darwin_x86_64", - ":freebsd", - ":windows", - ":tvos_x86_64", ], ) diff --git a/tensorflow/compiler/mlir/lite/kernels/internal/utils/sparsity_format_converter.cc b/tensorflow/compiler/mlir/lite/kernels/internal/utils/sparsity_format_converter.cc index e5db23a88318..aa639ef3acfd 100644 --- 
a/tensorflow/compiler/mlir/lite/kernels/internal/utils/sparsity_format_converter.cc +++ b/tensorflow/compiler/mlir/lite/kernels/internal/utils/sparsity_format_converter.cc @@ -16,7 +16,6 @@ limitations under the License. #include #include -#include #include #include "Eigen/Core" // from @eigen_archive diff --git a/tensorflow/compiler/mlir/lite/python/BUILD b/tensorflow/compiler/mlir/lite/python/BUILD index 916353ba408b..aeb56038984a 100644 --- a/tensorflow/compiler/mlir/lite/python/BUILD +++ b/tensorflow/compiler/mlir/lite/python/BUILD @@ -35,8 +35,8 @@ cc_library( "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite:tf_to_tfl_flatbuffer", "//tensorflow/compiler/mlir/lite:types_proto_cc", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/lite/tools/optimize:reduced_precision_metadata", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/core:core_cpu_base", @@ -63,7 +63,7 @@ cc_library( "//tensorflow/compiler/mlir/lite:converter_flags_proto_cc", "//tensorflow/compiler/mlir/lite:model_flags_proto_cc", "//tensorflow/compiler/mlir/lite:types_proto_cc", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/compiler/mlir/tensorflow/translate/tools:parsers", "//tensorflow/compiler/mlir/tf2xla/api/v2:graph_to_tf_executor", @@ -90,7 +90,7 @@ cc_library( "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite:tf_to_tfl_flatbuffer", "//tensorflow/compiler/mlir/lite:types_proto_cc", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/core:lib", @@ -117,7 +117,7 @@ cc_library( "//tensorflow/compiler/mlir/lite:model_flags_proto_cc", "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite:types_proto_cc", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/log", @@ -210,7 +210,6 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", - "//third_party/python_runtime:headers", # build_cleaner: keep; DNR: b/35864863 "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/log", "@com_google_absl//absl/status", @@ -218,6 +217,7 @@ cc_library( "@com_google_protobuf//:protobuf", "@com_google_protobuf//:protobuf_headers", "@flatbuffers//:runtime_cc", + "@local_xla//third_party/python_runtime:headers", # build_cleaner: keep; DNR: b/35864863 "@local_xla//xla/tsl/platform:status", ] + select({ # This is required when running `tflite_convert` from `bazel`. 
@@ -246,7 +246,7 @@ tf_python_pybind_extension( deps = [ "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", "//tensorflow/python/lib/core:pybind11_lib", - "//third_party/python_runtime:headers", + "@local_xla//third_party/python_runtime:headers", "@pybind11", ] + if_pywrap([":converter_python_api"]), ) diff --git a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc index a5227a7f4b6c..ffd4bab19611 100644 --- a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc @@ -27,8 +27,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" #include "tensorflow/compiler/mlir/lite/model_flags.pb.h" #include "tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/lite/types.pb.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h" #include "tensorflow/compiler/mlir/tf2xla/api/v2/graph_to_tf_executor.h" @@ -49,7 +49,7 @@ absl::Status ConvertGraphDefToTFLiteFlatBuffer( const GraphDef& input, std::string* result) { auto context = std::make_unique(); GraphImportConfig specs; - mlir::quant::QuantizationSpecs quant_specs; + mlir::TFL::QuantizationSpecs quant_specs; // Parse input arrays. std::vector node_names; diff --git a/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/BUILD b/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/BUILD index 9268de7ec1de..267ef251ebdd 100644 --- a/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/BUILD +++ b/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/BUILD @@ -12,7 +12,7 @@ cc_library( hdrs = ["python_utils.h"], compatible_with = get_compatible_with_portable(), deps = [ - "//third_party/python_runtime:headers", # buildcleaner: keep + "@local_xla//third_party/python_runtime:headers", # buildcleaner: keep ], ) @@ -23,6 +23,6 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ "//tensorflow/compiler/mlir/lite:stateful_error_reporter", - "//third_party/python_runtime:headers", # buildcleaner: keep + "@local_xla//third_party/python_runtime:headers", # buildcleaner: keep ], ) diff --git a/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/python_error_reporter.cc b/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/python_error_reporter.cc index 75f9222d7c22..594f9722fa5b 100644 --- a/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/python_error_reporter.cc +++ b/tensorflow/compiler/mlir/lite/python/interpreter_wrapper/python_error_reporter.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/python/interpreter_wrapper/python_error_reporter.h" +#include + #include #include #include diff --git a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc index d4a2d02db6ac..3aaad3c7767c 100644 --- a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc @@ -36,9 +36,9 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" #include "tensorflow/compiler/mlir/lite/model_flags.pb.h" #include "tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/types.pb.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "xla/hlo/parser/hlo_parser.h" #include "xla/hlo/translate/stablehlo.h" #include "xla/service/hlo.pb.h" @@ -83,7 +83,7 @@ absl::Status ConvertJaxToTFLiteFlatBuffer( const std::string& input, const tflite::ModelFlags& model_flags, tflite::ConverterFlags& converter_flags, std::string* result) { auto context = std::make_unique(); - mlir::quant::QuantizationSpecs quant_specs; + mlir::TFL::QuantizationSpecs quant_specs; // Parse input arrays. std::vector node_names; diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index 4dcf1497476f..fa94cd3b5b81 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -37,10 +37,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" #include "tensorflow/compiler/mlir/lite/model_flags.pb.h" #include "tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/types.pb.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "xla/tsl/platform/errors.h" @@ -137,7 +137,7 @@ absl::Status ConvertSavedModelToTFLiteFlatBuffer( tflite::ConverterFlags& converter_flags, std::string* result, const PyFunctionLibrary* quantization_py_function_lib) { auto context = std::make_unique(); - mlir::quant::QuantizationSpecs quant_specs; + mlir::TFL::QuantizationSpecs quant_specs; // Parse input arrays. 
std::vector node_names; @@ -217,21 +217,23 @@ absl::Status ConvertSavedModelToTFLiteFlatBuffer( pass_config.model_origin_framework = converter_flags.model_origin_framework(); pass_config.canonicalizing_inf_as_min_max_float = converter_flags.canonicalizing_inf_as_min_max_float(); + pass_config.unsafe_fuse_dynamic_shaped_broadcast = + converter_flags.unsafe_fuse_dynamic_shaped_broadcast(); if (converter_flags.strict_qdq_mode()) { pass_config.quant_specs.qdq_conversion_mode = - mlir::quant::QDQConversionMode::kQDQStrict; + mlir::TFL::QDQConversionMode::kQDQStrict; } else if (converter_flags.qdq_conversion_mode() == "STATIC") { pass_config.quant_specs.qdq_conversion_mode = - mlir::quant::QDQConversionMode::kQDQStatic; + mlir::TFL::QDQConversionMode::kQDQStatic; } else if (converter_flags.qdq_conversion_mode() == "DYNAMIC") { pass_config.quant_specs.qdq_conversion_mode = - mlir::quant::QDQConversionMode::kQDQDynamic; + mlir::TFL::QDQConversionMode::kQDQDynamic; // Need to set this or else the ops will still use floating point kernels pass_config.quant_specs.inference_type = tensorflow::DT_QINT8; } else if (converter_flags.qdq_conversion_mode() == "NONE") { pass_config.quant_specs.qdq_conversion_mode = - mlir::quant::QDQConversionMode::kQDQNone; + mlir::TFL::QDQConversionMode::kQDQNone; } else { return errors::InvalidArgument("Unknown QDQ conversion mode: ", converter_flags.qdq_conversion_mode()); diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc index 3534e57a5ea4..bdfdcc479d6a 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc @@ -33,11 +33,11 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" #include "tensorflow/compiler/mlir/lite/model_flags.pb.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" #include "tensorflow/compiler/mlir/lite/tools/optimize/reduced_precision_metadata.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/types.pb.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h" #include "xla/tsl/platform/statusor.h" @@ -216,7 +216,7 @@ absl::Status RegisterAllCustomOps( absl::Status PopulateQuantizationSpecs( const tflite::ModelFlags& model_flags, tflite::ConverterFlags& converter_flags, - mlir::quant::QuantizationSpecs* quant_specs, + mlir::TFL::QuantizationSpecs* quant_specs, std::vector* node_names, std::vector* node_dtypes, std::vector>>* node_shapes, std::vector>* node_mins, @@ -264,8 +264,8 @@ absl::Status PopulateQuantizationSpecs( } } - if (mlir::quant::GetInputNodeQuantSpecs(*node_names, *node_mins, *node_maxs, - inference_type, quant_specs)) { + if (mlir::TFL::GetInputNodeQuantSpecs(*node_names, *node_mins, *node_maxs, + inference_type, quant_specs)) { return errors::InvalidArgument("Failed to get input quant spec."); } diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h index fec9450f4296..f837a6f0140e 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h @@ -27,9 +27,9 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" #include "tensorflow/compiler/mlir/lite/model_flags.pb.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/types.pb.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/types.h" @@ -46,8 +46,8 @@ absl::Status RegisterAllCustomOps( absl::Status PopulateQuantizationSpecs( const tflite::ModelFlags& model_flags, tflite::ConverterFlags& converter_flags, - mlir::quant::QuantizationSpecs* quant_specs, - std::vector* node_names, std::vector* node_dtypes, + mlir::TFL::QuantizationSpecs* quant_specs, std::vector* node_names, + std::vector* node_dtypes, std::vector>>* node_shapes, std::vector>* node_mins, std::vector>* node_maxs); diff --git a/tensorflow/compiler/mlir/lite/quantization/BUILD b/tensorflow/compiler/mlir/lite/quantization/BUILD index 4c4872ed1351..d7a055a3daea 100644 --- a/tensorflow/compiler/mlir/lite/quantization/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/BUILD @@ -109,8 +109,8 @@ cc_library( hdrs = ["quantization_context.h"], deps = [ ":device_target", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/BUILD b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/BUILD new file mode 100644 index 000000000000..56f4af8ce837 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/BUILD @@ -0,0 +1,144 @@ +load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + # By default, these targets should only be used within the quantization library. 
+ default_visibility = [ + "//learning/brain/mlir/quantization:__subpackages__", + "//platforms/darwinn/compiler:__subpackages__", + "//tensorflow:__subpackages__", + ], + licenses = ["notice"], +) + +cc_library( + name = "tfl_quantization_driver", + srcs = [ + "tfl_quantization_driver.cc", + ], + hdrs = [ + "tfl_quantization_driver.h", + ], + deps = [ + ":quantization_config", + ":quantization_lib", + "//tensorflow/compiler/mlir/lite:tensorflow_lite_ops", + "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + ], +) + +cc_library( + name = "quantization_lib", + srcs = [ + "quantization_driver.cc", + "quantization_interface.cc.inc", + "quantization_utils.cc", + ], + hdrs = [ + "quantization_driver.h", + "quantization_interface.h.inc", + "quantization_traits.h", + "quantization_utils.h", + ], + deps = [ + ":quantization_config", + ":quantization_interfaces_inc_gen", + "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy:portable_tensor_utils", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", + "//tensorflow/compiler/mlir/tools/optimize:quantization_utils", + "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + ], +) + +tf_cc_test( + name = "quantization_driver_test", + srcs = ["quantization_driver_test.cc"], + deps = [ + ":quantization_lib", + "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:func", + "//tensorflow/compiler/mlir/quantization/common:test_base", + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + ], +) + +td_library( + name = "quantization_td_files", + srcs = [ + "quantization.td", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/lite/quantization/ir:QuantizationOpsTdFiles", + "@llvm-project//mlir:OpBaseTdFiles", + ], +) + +gentbl_cc_library( + name = "quantization_interfaces_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = { + "quantization_interface.h.inc": ["-gen-op-interface-decls"], + "quantization_interface.cc.inc": ["-gen-op-interface-defs"], + }, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "quantization.td", + deps = [ + ":quantization_td_files", + ], +) + +cc_library( + name = "quantization_config", + srcs = [ + "quantization_config.cc", + ], + hdrs = [ + "quantization_config.h", + ], + deps = [ + "//tensorflow/compiler/mlir/lite/tools/optimize:reduced_precision_metadata", + 
"//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + ], +) + +exports_files([ + "quantization_traits.h", + "quantization_config.h", + "quantization_utils.h", +]) diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization.td b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization.td new file mode 100644 index 000000000000..02f874d8f3d6 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization.td @@ -0,0 +1,227 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the quantization definition file for TensorFlow. + +#ifdef TF_Quantization +#else +#define TF_Quantization + +include "mlir/IR/OpBase.td" +include "mlir/Dialect/Quant/IR/QuantBase.td" + +//===----------------------------------------------------------------------===// +// QuantizedType definitions. +//===----------------------------------------------------------------------===// + +// The base class of a quantized type. Signed quantized types may be expressed +// as signless integers (i.e. up to op interpretation), but we include an +// explicit signedness check to differentiate the signed/unsigned constraints +// predicates from one another at the TD level. +class QuantizedType params, bit signed> + : Type($_self)">, + CPred<"llvm::cast($_self)" # + ".getStorageTypeIntegralWidth() == " # !head(params)>, + Or<[CPred<"llvm::cast($_self)" # + ".getStorageType().isSignlessInteger()">, + CPred<"llvm::cast($_self)" # + ".getStorageType().isSignedInteger() == " # signed>]>]>, + "Q" # !if (signed, "I", "UI") # !head(params) # " type"> { + string name = n; + string asTraitArgsStr = + !interleave(params, ", ") # !if(signed, ", true", ", false"); +} + +// Uniform quantized types. Two integers "smantissa" and "sexp" are used to +// express the Mantissa and Exponent components of the floating-point scale so +// the scale of the quantized type is "smantissa * 10 ^ sexp". +class UInt8UniformQuantizedType + : QuantizedType<"Uniform", + [8, zero_pt, smantissa, sexp, 0, 255], 0>; +class Int8UniformQuantizedType + : QuantizedType<"Uniform", + [8, zero_pt, smantissa, sexp, -128, 127], 1>; + +// General uniform quantized types. The definitions can be used to specify +// operand's tensor types. +def QI4 : QuantizedType<"Uniform", [4], 1>; +def QUI8 : QuantizedType<"Uniform", [8], 0>; +def QI8 : QuantizedType<"Uniform", [8], 1>; +def QUI16 : QuantizedType<"Uniform", [16], 0>; +def QI16 : QuantizedType<"Uniform", [16], 1>; +def QUI32 : QuantizedType<"Uniform", [32], 0>; +def QI32 : QuantizedType<"Uniform", [32], 1>; + +//===----------------------------------------------------------------------===// +// TFL native op traits (for quantization). 
+// +// Ops in the link below should have these traits specified: +// https://www.tensorflow.org/lite/performance/quantization_spec +//===----------------------------------------------------------------------===// + +def FixedOutputRangeInterface : OpInterface< + "FixedOutputRangeInterface"> { + let cppNamespace = "TFL"; + + let description = [{ + Interface for defining the fixed output range. + }]; + + let methods = [ + InterfaceMethod< + [{Returns the fixed output range.}], + "UniformQuantizedType", "GetFixedOutputRange", + (ins "bool":$sign, "int":$bit_width) + >, + ]; +} + +def AffineQuantizedOpInterface : OpInterface< + "AffineQuantizedOpInterface"> { + let cppNamespace = "TFL"; + + let description = [{ + Interface for affine quantized ops (conv2d, fully_connected, etc.). + }]; + + let methods = [ + InterfaceMethod< + [{Returns the affine operand index.}], + "int", "GetAffineOperandIndex", + (ins), [{}], [{return 1;}]>, + InterfaceMethod< + [{Returns whether narrow range is required for the affine operand.}], + "bool", "RequiredNarrowRangeAffineOperand", + (ins), [{}], [{return true;}]>, + InterfaceMethod< + [{Returns the quantization dim for the affine operand.}], + "int", "GetQuantizationDimIndex", + (ins)>, + InterfaceMethod< + [{Returns the dimension index of the output channels.}], + "int", "GetChannelDimIndex", (ins) + >, + ]; +} + +def SameOperandsAndResultsScale : OpInterface<"SameScalesOpInterface"> { + let cppNamespace = "TFL"; + + let description = [{ + Interface for ops that potentially have the same operands and results scales. + }]; + + let methods = [ + InterfaceMethod< + [{Returns whether the same operands and results scales are required.}], + "bool", "RequiredSameOperandsAndResultsScale", + (ins "bool":$sign, "int":$bit_width), [{}], [{return true;}] + >, + InterfaceMethod< + [{Returns whether operands and results must have the same quantized axis.}], + "bool", "RequiredSameQuantizedAxes", + (ins), [{}], [{return true;}] + >, + ]; + + let verify = [{ + return TFL::VerifySameScales($_op); + }]; +} + +def DynamicRangeQuantizedOpInterface : OpInterface< + "DynamicRangeQuantizedOpInterface"> { + let cppNamespace = "TFL"; + + let description = [{ + Interface for ops for which dynamic range quantization is supported. + + If the op has kernel support for dynamic range quantization, the Q/DQ op + pairs connected to the op are rewritten to its quantized alternative, where + the new op uses the Q ops as its operands instead of the DQ ops. Otherwise, the op is + left as is for weight-only execution, which means the weight is dequantized at runtime. + + For example, if the kernel does not support dynamic range quantization, the + graph will be converted into the following IR: + + %q_w = "tfl.pseudo_qconst"() { + qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> + %w = "tfl.dequantize"(%q_w) : + (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>) -> + tensor<64x3x3x3xf32> + %conv = "tfl.conv_2d"(%input_act, %w, %bias) + + but if it is supported, it will be rewritten as: + + %q_w = "tfl.pseudo_qconst"() { + qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> + %conv = "tfl.conv_2d"(%input_act, %q_w, %bias) + + Note that this is part of reaching feature parity with the old quantizer for + dynamic range quantization, except: + - Only use_updated_hybrid_scheme=True is supported, which means ops that + support asymmetrically quantized inputs always use this feature + during the MLIR graph rewriting passes, while it is configurable in the old + quantizer.
So when those ops are matched during the graph rewriting passes, + the MLIR quantizer will always ignore the pre-set value of the attribute, if + there is any, and set it to True. The reason behind this decision is that + generally activations of these ops show better accuracy with asymmetric + input quantization, so we want to deprecate symmetric activation quantization + for those ops eventually. + - Unlike the old quantizer, per-channel quantization is supported for the + weight-only TransposeConvOp. + }]; + + let methods = [ + InterfaceMethod< + [{Returns the quantizable operand indices of the op.}], + "std::vector", "GetQuantizableOperandIndices", + (ins), [{}], [{return {};}]>, + InterfaceMethod< + [{Returns whether the op has kernel support for dynamic range + quantization.}], + "bool", "GetDynamicRangeQuantKernelSupport", + (ins), [{}], [{return false;}]>, + InterfaceMethod< + [{Returns whether the op requires the asymmetric quantize input attribute + to be set.}], + "bool", "RequireAsymmetricQuantizeInputsAttr", + (ins), [{}], [{return false;}]>, + ]; +} + +// Specify this trait if the op has a fixed output value range. +class FixedResultScale : NativeOpTrait::Impl")>; + +// Specify this trait if the bias-th input of the op is a bias input, which +// needs a scale based on the scales of op1 and op2. +class AccumulatorUniformScale : NativeOpTrait< + !strconcat("TFL::AccumulatorUniformScale<", + !interleave([bias, op1, op2], ", "), + ">::Impl")>; + +// Specify the operand index of the coefficient operand for an affine op +// and also the quantization dimension if per-axis quantization is supported. +// If the quantization dimension is -1, per-axis quantization isn't supported. +class AffineOpCoefficient : NativeOpTrait< + !strconcat("TFL::AffineOpCoefficient<", + !interleave([dim, index], ", "), + ">::Impl")>; + +// Specify this trait if the op has a quantizable output. Quantizers will +// apply quantization on this op. +def QuantizableResult : NativeOpTrait<"TFL::QuantizableResult">; +#endif // TF_Quantization diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.cc b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.cc new file mode 100644 index 000000000000..9aef4058cf96 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.cc @@ -0,0 +1,184 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/numbers.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "tensorflow/core/framework/types.pb.h" + +// Returns whether the given dtype is a quantization type in TensorFlow. +static bool IsQuantizationType(tensorflow::DataType dtype) { + switch (dtype) { + case tensorflow::DT_QINT8: + case tensorflow::DT_QUINT8: + case tensorflow::DT_QINT16: + case tensorflow::DT_QUINT16: + case tensorflow::DT_QINT32: + return true; + default: + return false; + } +} + +namespace mlir { +namespace TFL { +namespace { +bool GetBooleanSpecs(const std::string& bool_val) { + bool result; + std::stringstream iss(bool_val); + iss >> std::boolalpha >> result; + return result; +} +} // namespace + +void ParseCustomOpSpecs(const absl::string_view node_names, + const CustomOpUpdateOptions& update_option, + CustomOpMap& custom_op_map) { + if (node_names.empty()) return; + + const std::vector custom_nodes = absl::StrSplit(node_names, ','); + + for (const std::string& cur_node : custom_nodes) { + const std::vector node_infos = absl::StrSplit(cur_node, '='); + const std::string& node_name = node_infos[0]; + const std::string& node_specification = node_infos[1]; + CustomOpInfo new_node_info; + switch (update_option) { + case CustomOpUpdateOptions::kInputIndices: { + const std::vector indices = + absl::StrSplit(node_specification, '-'); + for (const std::string& cur_index : indices) { + custom_op_map[node_name].quantizable_input_indices.push_back( + std::stoi(cur_index)); + } + break; + } + case CustomOpUpdateOptions::kWeightOnly: + custom_op_map[node_name].is_weight_only = + GetBooleanSpecs(node_specification); + break; + case CustomOpUpdateOptions::kNoSideEffect: + custom_op_map[node_name].no_side_effect = + GetBooleanSpecs(node_specification); + break; + } + } +} + +bool ParseInputNodeQuantSpecs(const absl::string_view node_names, + const absl::string_view min_values, + const absl::string_view max_values, + const absl::string_view inference_type, + QuantizationSpecs* quant_specs) { + const std::vector input_nodes = absl::StrSplit(node_names, ','); + std::vector> node_mins; + if (!min_values.empty()) { + std::vector node_mins_str = absl::StrSplit(min_values, ','); + for (const std::string& node_min_str : node_mins_str) { + double value; + if (!absl::SimpleAtod(node_min_str, &value)) { + llvm::errs() << "Unexpected mins: " << node_min_str << "\n"; + return true; + } + node_mins.push_back(value); + } + } + + std::vector> node_maxs; + if (!max_values.empty()) { + const std::vector node_maxs_str = + absl::StrSplit(max_values, ','); + for (const std::string& node_max_str : node_maxs_str) { + double value; + if (!absl::SimpleAtod(node_max_str, &value)) { + llvm::errs() << "Unexpected maxs: " << node_max_str << "\n"; + return true; + } + node_maxs.push_back(value); + } + } + + tensorflow::DataType final_type = tensorflow::DT_FLOAT; + if (!inference_type.empty() && + !DataType_Parse(std::string(inference_type), &final_type)) { + return true; + } + return GetInputNodeQuantSpecs(input_nodes, node_mins, node_maxs, final_type, + quant_specs); +} + +bool GetInputNodeQuantSpecs(const std::vector& node_names, + const std::vector>& node_mins, + const std::vector>&
node_maxs, + const tensorflow::DataType inference_type, + QuantizationSpecs* quant_specs) { + quant_specs->inference_type = inference_type; + + // If min/max are not specified, just return; + if (node_mins.empty() || node_maxs.empty()) return false; + + // Otherwise make sure min/max has the same size as inputs. + if (IsQuantizationType(inference_type)) { + // min/max should have same size as inputs, or shouldn't be specified. + if (node_names.size() != node_mins.size() || + node_names.size() != node_maxs.size()) { + return true; + } + for (int i = 0; i < node_names.size(); ++i) { + quant_specs->input_ranges.push_back({node_mins[i], node_maxs[i]}); + } + return false; + } + if (!node_mins.empty()) { + llvm::dbgs() << "Ignored input_min_values."; + } + if (!node_maxs.empty()) { + llvm::dbgs() << "Ignored input_max_values."; + } + return false; +} + +std::string GetQDQQuantModeString(const QDQConversionMode mode) { + switch (mode) { + case QDQConversionMode::kQDQStatic: + return "Static"; + case QDQConversionMode::kQDQDynamic: + return "Dynamic"; + case QDQConversionMode::kQDQStrict: + return "Strict"; + default: + return "NoQDQ"; + } +} + +QDQConversionMode GetQDQQuantModeFromString(const std::string& mode_str) { + if (mode_str == "Static") return QDQConversionMode::kQDQStatic; + if (mode_str == "Dynamic") return QDQConversionMode::kQDQDynamic; + if (mode_str == "Strict") return QDQConversionMode::kQDQStrict; + return QDQConversionMode::kQDQNone; +} + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h new file mode 100644 index 000000000000..5f7fde15a68a --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h @@ -0,0 +1,255 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines node specs for quantization and the methods to parse +// command line flags to these specs. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_CONFIG_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_CONFIG_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/lite/tools/optimize/reduced_precision_metadata.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace mlir { +namespace TFL { + +// Stores information about how to quantize a user-specified custom operation. 
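+// For illustration (the op name "MyCustomOp" below is hypothetical, not part of
+// the original change): the flag string "MyCustomOp=0-2" parsed by
+// ParseCustomOpSpecs() with CustomOpUpdateOptions::kInputIndices fills
+// quantizable_input_indices with {0, 2}, while "MyCustomOp=true" parsed with
+// kWeightOnly or kNoSideEffect sets the corresponding boolean field.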
+struct CustomOpInfo { + std::vector quantizable_input_indices; + bool is_weight_only = false; + bool no_side_effect = true; +}; + +using CustomOpMap = std::unordered_map; +enum CustomOpUpdateOptions { kInputIndices, kWeightOnly, kNoSideEffect }; +enum class QDQConversionMode { kQDQNone, kQDQStatic, kQDQDynamic, kQDQStrict }; + +struct QuantizationSpecs { + // Which function these node quant specifications belong to. + std::string target_func = "main"; + + // Whether to trigger quantization passes for post-training quantization. + // If true, the model input doesn't require user-specified input ranges. + bool post_training_quantization = false; + + // Whether to allow dynamic range quantization. This is the easiest + // quantization mode, which doesn't require QAT or sample inputs. + // This option only targets `DT_HALF` and `DT_QINT8` inference types. + bool weight_quantization = false; + + // Whether to use the MLIR dynamic range quantizer instead of TOCO. + bool enable_mlir_dynamic_range_quantizer = false; + + // Whether to allow weight-only quantization. This scheme quantizes + // weights but will dequantize them back at runtime, which is useful for + // memory-bound cases without kernel support available in lower precisions. + // Used in the MLIR dynamic range quantizer. + bool weight_only_quantization = false; + + // The minimum number of elements in a weights array required to apply + // quantization. This is especially useful to avoid quantizing small tensors, as + // it is hard to get performance benefits from them with quantization. Used + // in the MLIR dynamic range quantizer with int8 weight data type. + int64_t minimum_elements_for_weights = 1024; + + // Whether to calculate scales in float to keep quantized values the same as the + // old TOCO quantizer. + bool legacy_float_scale = false; + + // Whether to perform per-tensor quantization. Currently, this option is only + // valid when the quantization parameters need to be created by scanning the + // constant content (post-training quantization or QAT without weight + // FakeQuant). + bool disable_per_channel = false; + + // Whether to disable per-channel weight quantization and enable legacy per- + // tensor quantization. The legacy quantization for Dense layers is + // inconsistent with Conv 1x1, which always performs per-channel quantization. + bool disable_per_channel_for_dense_layers = false; + + // Whether to use fixed output ranges of the activation ops (tanh, sigmoid, + // etc.) and not infer weight constants. + // If this option is set, quantization emulation ops should be placed after + // the ops in the input graph. This flag should be set to false for + // post-training quantization. + bool disable_infer_tensor_range = false; + + // Whether to use unfrozen variable quantization in MLIR. Typically, + // variables are frozen before the passes run, but some variables aren't frozen. + // If true, the QuantizeVariables pass will be added after the + // PrepareQuantizePass. + bool enable_mlir_variable_quantization = false; + + // The node type when the model is exported. Currently this is limited to + // DT_FLOAT, DT_HALF, DT_QINT8, and DT_QUINT8. When DT_HALF is used, the + // `weight_quantization` flag needs to be set to true. When DT_QUINT8 is used, + // the `weight_quantization` flag needs to be set to false. + tensorflow::DataType inference_type = tensorflow::DT_FLOAT; + + // The input and output data type during inference. This flag is only used + // when `inference_type` is different from DT_FLOAT.
This flag can only be set + to DT_FLOAT or the same as `inference_type`. If this flag is different + from `inference_type`, adaptor ops are inserted as leading and trailing ops + in the result model. + tensorflow::DataType inference_input_type = tensorflow::DT_FLOAT; + + // Input node ranges. These ranges are stored in the same order as the function + // arguments. They are only used when `weight_quantization` is set to false, + // and the model is required to have quantization parameters, either from + // quantization aware training or calibration, for the remaining tensors. + std::vector, std::optional>> + input_ranges; + + // Whether to disable setting the quantization parameters of the input nodes + // using input ranges. + bool disable_set_input_nodes_quantization_params = false; + + // The default ranges can be used when a tensor doesn't have quantization + // parameters and couldn't be quantized. Used only for latency tests. + std::pair, std::optional> default_ranges; + + // A serialized "QuantizationInfo" object to specify value ranges for some of + // the tensors with known names. + std::string serialized_quant_stats = ""; + + // A bitmask to encode support for reduced precision inference in the model. + tflite::optimize::ReducedPrecisionSupport support_mask = + tflite::optimize::ReducedPrecisionSupport::None; + + // Whether to run the passes to propagate the quantization parameters and + // graph rewrites. Returns false if the inference_type is DT_FLOAT or the + // `weight_quantization` flag is set. + bool RunPropagationAndRewriteQuantizationPasses() const { + return inference_type != tensorflow::DT_FLOAT && !weight_quantization; + } + + // TODO: b/202075505 - make implicit weight type clearer + // Whether to run the passes and graph rewrites for dynamic range quantization. + bool RunAndRewriteDynamicRangeQuantizationPasses() const { + bool dynamic_range_quantize = + (inference_type != tensorflow::DT_FLOAT) && weight_quantization && + !post_training_quantization && !disable_infer_tensor_range && + enable_mlir_dynamic_range_quantizer; + return dynamic_range_quantize; + } + + // Returns whether this inference type represents a signed storage type. + bool IsSignedInferenceType() const { + switch (inference_type) { + case tensorflow::DT_QUINT8: + case tensorflow::DT_QUINT16: + return false; + default: + return true; + } + } + + // Gets the width of this quantization type. Returns 0 if it isn't a + // quantization type. + int64_t GetQuantizationTypeWidth() const { + switch (inference_type) { + case tensorflow::DT_INT8: + case tensorflow::DT_UINT8: + case tensorflow::DT_QINT8: + case tensorflow::DT_QUINT8: + return 8; + case tensorflow::DT_INT16: + case tensorflow::DT_UINT16: + case tensorflow::DT_QINT16: + case tensorflow::DT_QUINT16: + return 16; + case tensorflow::DT_INT32: + case tensorflow::DT_QINT32: + return 32; + default: + return 0; + } + } + + // Whether to add the NumericVerify ops to verify numbers before and after + // quantization. + bool verify_numeric = false; + // Whether to add verification layer by layer or on the whole model. When + // disabled (per-layer), float and quantized ops will be run from the same input + // (the output of the previous quantized layer). When enabled, float and quantized ops + // will run with their respective float and quantized outputs of previous ops. + bool whole_model_verify = false; + + // Whether to use fake quant attributes to calculate quantization parameters. + bool use_fake_quant_num_bits = false; + + // Names of ops to block from quantization.
Used in QuantizePass. + // For dynamic range quantization, ops in blocklist are quantized in weight- + // only manner. + absl::flat_hash_set ops_blocklist; + + // Names of locations to block from quantization. Used in QuantizePass. + absl::flat_hash_set nodes_blocklist; + + // Map from custom op code to custom op quantization information. + // For dynamic range quantization, among the custom ops in the graph those + // specified in this map are subject to quantization. + CustomOpMap custom_map; + + // If other than kQDQNone, the model is a floating point graph with QDQ ops + // to be eliminated and fused into quantized kernels. + QDQConversionMode qdq_conversion_mode = QDQConversionMode::kQDQNone; + + // When set, adheres to the QDQ annotations added by the framework when + // possible rather than quantizing any op that is possible to quantize. + bool strict_qdq_mode = false; +}; + +// Parses the command line flag strings to the CustomOpMap specification. +void ParseCustomOpSpecs(absl::string_view node_names, + const CustomOpUpdateOptions& update_option, + CustomOpMap& custom_op_map); + +// Parses the command line flag strings to the quantization specification for +// input arrays of a graph. The array names are not stored in the spec, and will +// be matched by position. Returns true if failed. +bool ParseInputNodeQuantSpecs(absl::string_view node_names, + absl::string_view min_values, + absl::string_view max_values, + absl::string_view inference_type, + QuantizationSpecs* quant_specs); + +// Gets the quantization specification for input arrays. The array names are not +// stored in the spec, and will be matched by position. The min/max will be +// ignored if the inference_type isn't a quantized type. Returns true if failed. +bool GetInputNodeQuantSpecs(const std::vector& node_names, + const std::vector>& node_mins, + const std::vector>& node_maxs, + tensorflow::DataType inference_type, + QuantizationSpecs* quant_specs); + +// Returns a human-readable string of the QDQQuantMode enum class +std::string GetQDQQuantModeString(QDQConversionMode mode); + +// Returns the QDQQuantMode enum class from a human-readable string +QDQConversionMode GetQDQQuantModeFromString(const std::string& mode_str); +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_CONFIG_H_ diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_driver.cc new file mode 100644 index 000000000000..0ce7f43cd24f --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_driver.cc @@ -0,0 +1,958 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_driver.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_traits.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" + +namespace mlir { +namespace TFL { +namespace { + +constexpr int32_t kBiasMax = std::numeric_limits::max() / 2; + +// Uses the type of `value` to set the initial state of the index-th result if +// `as_result` is true or index-th operand if `as_result` is false. The state +// is immutable if the type is a quantized type. Returns the index of this +// new state in the state vector. 
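+// The new state's index is recorded in `operand_states` or `result_states`,
+// keyed by the (op, index) pair; `value_to_state` deduplicates states so that
+// a value shared by several ops maps to a single entry referenced by all of them.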
+void InitializeStateForValue( + Operation* op, const int index, const Value value, const bool as_result, + std::vector& states, + DenseMap& value_to_state, + DenseMap& operand_states, + DenseMap& result_states) { + const auto [cached, inserted] = value_to_state.try_emplace(value, 0); + if (!inserted) { + if (as_result) { + result_states[{op, index}] = cached->second; + } else { + operand_states[{op, index}] = cached->second; + } + return; + } + + const QuantizedType quantized_type = + QuantizedType::getQuantizedElementType(value.getType()); + + const bool immutable = quantized_type != nullptr; + const QuantizationDriver::QuantStateIndex next_state_index = states.size(); + states.push_back({quantized_type, immutable}); + if (as_result) { + result_states[{op, index}] = next_state_index; + } else { + operand_states[{op, index}] = next_state_index; + } + + cached->second = next_state_index; +} + +bool HasPerAxisQuantizedOperand(Operation* op) { + for (int i = 0; i < op->getNumOperands(); ++i) { + if (auto dq_op = dyn_cast_or_null( + op->getOperand(i).getDefiningOp())) { + auto type = + mlir::cast(dq_op.getArg().getType()).getElementType(); + if (auto per_axis_qtype = + mlir::dyn_cast_or_null( + QuantizedType::getQuantizedElementType(type))) { + return true; + } + } + } + return false; +} + +} // namespace + +void QuantizationDriver::InitializeArgState(const BlockArgument arg, + const Value arg_value) { + const auto [cached, inserted] = value_to_state_.try_emplace(arg_value, 0); + if (!inserted) { + arg_states_[arg] = cached->second; + return; + } + + const QuantizedType quantized_type = + QuantizedType::getQuantizedElementType(arg_value.getType()); + const bool immutable = quantized_type != nullptr; + const QuantizationDriver::QuantStateIndex next_state_index = states_.size(); + states_.push_back({quantized_type, immutable}); + arg_states_[arg] = next_state_index; + cached->second = next_state_index; +} + +void QuantizationDriver::InitializeOperandState(Operation* op, const int index, + const Value value) { + InitializeStateForValue(op, index, value, /*as_result=*/false, states_, + value_to_state_, operand_states_, result_states_); +} + +void QuantizationDriver::InitializeResultState(Operation* op, const int index, + const Value value) { + InitializeStateForValue(op, index, value, /*as_result=*/true, states_, + value_to_state_, operand_states_, result_states_); +} + +std::unique_ptr QuantizationDriver::GetQuantSpec(Operation* op) { + return op_quant_spec_getter_(op); +} + +std::unique_ptr QuantizationDriver::GetQuantScaleSpec( + Operation* op) { + return op_quant_scale_spec_getter_(op); +} + +bool QuantizationDriver::IsQuantized(Operation* op) { + for (int i = 0; i < op->getNumResults(); ++i) { + if (GetResultQuantState(op, i).IsEmpty()) return false; + } + return true; +} + +bool QuantizationDriver::SetConstantResultParams(Operation* op) { + DenseFPElementsAttr attr; + const Value result = op->getResult(0); + if (!matchPattern(result, m_Constant(&attr))) { + return false; + } + // TODO: b/323478683 - Make storage_type_width and narrow_range configurable. 
+ Type final_type; + const auto it = optimized_weights_.find(op); + const bool is_weight = it != optimized_weights_.end(); + const bool is_weight_with_per_channel_support = + is_weight && it->second != -1 && is_signed_; + + if (is_weight_with_per_channel_support && !disable_per_channel_) { + // When `disable_per_channel_` is false, per-channel symmetric quantization + // parameters are created from the weights when the ops support per-channel + // quantization. Otherwise, uses per-tensor asymmetric quantization with + // narrow range. + + // per-axis quantization weight, with symmetric min/max enforced. + final_type = GetUniformQuantizedPerAxisTypeForWeight( + attr, it->second, /*symmetric=*/true, /*num_bits=*/8, is_signed_, + /*narrow_range=*/true, legacy_float_scale_); + } else { + // per-tensor quantization weight + final_type = GetUniformQuantizedTypeForWeight( + attr, /*symmetric=*/is_weight && is_signed_, + /*num_bits=*/8, is_signed_, + /*narrow_range=*/is_weight, legacy_float_scale_); + } + if (const auto quant_type = mlir::dyn_cast_or_null(final_type); + quant_type != nullptr) { + return SetResultParams(op, /*result_index=*/0, quant_type); + } + return false; +} + +bool QuantizationDriver::SetResultParams(Operation* op, const int result_index, + const QuantizedType quantized_type) { + QuantState& state = GetResultQuantState(op, result_index); + if (state.params == quantized_type) { + return false; + } + if (!state.IsEmpty()) { + RequantizeStates& rescales = GetResultRequantizeStates(op, result_index); + RequantizeState& rescale = rescales.emplace_back(); + rescale.pos = RequantizeState::ON_INPUT; + rescale.params = quantized_type; + return true; + } + state.params = quantized_type; + AddUserToList(op, result_index); + return true; +} + +QuantizedType QuantizationDriver::GetBiasParams( + Operation* op, const int bias_index, + const ArrayRef non_bias_operand_indices, + const AccumulatorScaleFunc func) { + QuantState& bias_state = GetOperandQuantState(op, bias_index); + if (!bias_state.IsEmpty()) { + return bias_state.params; + } + std::vector op_types{}; + op_types.reserve(non_bias_operand_indices.size()); + + int adjusted_quant_dim = -1; + if (op->getNumOperands() > bias_index) { + // Some kernels allow 1D bias, broadcasting it inside the kernel. In this + // case, the `quantizedDimension=0` when quantizing per-channel. + // However, for some kernels which require bias to be already broadcasted + // to match the accumulation shape, the very last index should be used. + Operation* bias_op = op->getOperand(bias_index).getDefiningOp(); + if (bias_op != nullptr) { + Type bias_type = bias_op->getResult(0).getType(); + if (bias_type != builder_.getNoneType()) { + const int bias_rank = mlir::dyn_cast(bias_type).getRank(); + adjusted_quant_dim = bias_rank > 1 ? 
bias_rank - 1 : 0; + } + } + } + + for (const int non_bias_operand_index : non_bias_operand_indices) { + const QuantState& non_bias_state = + GetOperandQuantState(op, non_bias_operand_index); + op_types.push_back(non_bias_state.params); + } + return func(op_types, adjusted_quant_dim, legacy_float_scale_); +} + +bool QuantizationDriver::SetOperandParams(Operation* op, + const int operand_index, + const QuantizedType quantized_type, + const bool override) { + QuantState& state = GetOperandQuantState(op, operand_index); + if (state.params == quantized_type) { + return false; + } + + if (!state.IsEmpty() && !override) { + RequantizeStates& rescales = GetOperandRequantizeStates(op, operand_index); + for (RequantizeState& rescale : rescales) { + if (rescale.params == quantized_type) { + rescale.users.emplace_back(op, operand_index); + return true; + } + } + RequantizeState& rescale = rescales.emplace_back(); + rescale.pos = RequantizeState::ON_OUTPUT; + rescale.params = quantized_type; + rescale.users.emplace_back(op, operand_index); + return true; + } + + state.params = quantized_type; + AddOperandToList(op, operand_index); + return true; +} + +void QuantizationDriver::QuantizeOpResult(Operation* op, const int result_index, + const QuantizedType quantized_type) { + builder_.setInsertionPointAfter(op); + const Value original_result = op->getResult(result_index); + QuantizeValue(original_result, quantized_type, op->getLoc()); +} + +void QuantizationDriver::QuantizeArg(BlockArgument arg, + const QuantizedType quantized_type) { + builder_.setInsertionPointToStart(arg.getOwner()); + QuantizeValue(arg, quantized_type, builder_.getUnknownLoc()); +} + +void QuantizationDriver::QuantizeValue(Value value, + QuantizedType quantized_type, + const Location loc) { + const Type expressed_type = value.getType(); + const Type new_value_type = + quantized_type.castFromExpressedType(expressed_type); + // Skip if `value` or `value`'s element type doesn't match the expressed type + // of `quantized_type`. + if (new_value_type == nullptr) return; + + auto quantize = + builder_.create(loc, new_value_type, value); + auto dequantize = builder_.create( + loc, expressed_type, quantize.getResult()); + + // This attribute is set to distinguish the quantize ops being added by the + // quantization pass. These ops can be removed without losing original + // program accuracy. + // TODO: b/323478683 - Make the attribute being part of op definition. + quantize->setAttr(kVolatileOpAttrName, builder_.getUnitAttr()); + + // `original_result` has a use to `quantize`, so this will replace that use + // by the result of `dequantize`. Remember to reset that use afterwards + value.replaceAllUsesWith(dequantize); + quantize.getOperation()->replaceUsesOfWith(dequantize, value); +} + +void QuantizationDriver::RequantizeOpResult(Operation* op, + const int result_index, + RequantizeStates& states) { + if (states.empty()) return; + + builder_.setInsertionPointAfter(op); + Value value = op->getResult(result_index); + RequantizeState::RequantizePosition pos = states.front().pos; + if (pos == RequantizeState::NO_REQUANTIZE) { + return; + } + for (const RequantizeState& state : states) { + // Check that all requantization positions are the same for each state. + // Unsure if this check is required. + if (state.pos != pos) { + return; + } + } + if (pos == RequantizeState::ON_OUTPUT) { + Operation* user = value.getUses().begin().getUser(); + if (isa(user)) { + // The requantize op is inserted between `quantize` and `dequantize` ops. 
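+      // A schematic sketch of the rewrite performed here (value names are
+      // illustrative only):
+      //   before: %q  = QuantizeCastOp(%v)   // original params
+      //           %dq = DequantizeCastOp(%q)
+      //   after:  %rq = QuantizeCastOp(%q)   // requantized params
+      //           %dq = DequantizeCastOp(%rq)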
+ value = user->getResult(0); + builder_.setInsertionPointAfter(user); + } + } + RequantizeValue(value, states, op->getLoc()); +} + +void QuantizationDriver::RequantizeArg(const BlockArgument arg, + RequantizeStates& states) { + Value value = arg; + builder_.setInsertionPointToStart(arg.getOwner()); + if (value.hasOneUse()) { + Operation* user = value.use_begin().getUser(); + if (auto q = dyn_cast(user)) { + value = q.getResult(); + builder_.setInsertionPoint(arg.getOwner(), ++Block::iterator(user)); + } + } + RequantizeValue(value, states, builder_.getUnknownLoc()); +} + +void QuantizationDriver::RequantizeValue(Value value, RequantizeStates& states, + const Location loc) { + if (states.empty() || states.front().pos == RequantizeState::NO_REQUANTIZE) { + return; + } + if (states.front().pos == RequantizeState::ON_INPUT) { + RequantizeState& state = states.front(); + const Type expressed_type = value.getType(); + // The value needs to be requantized. A Quantize op will be created to use + // it as the operand and replace its uses. + const Type new_type = state.params.castFromExpressedType(expressed_type); + if (!new_type) return; + auto requantize_op = + builder_.create(loc, new_type, value); + value.replaceAllUsesWith(requantize_op); + requantize_op.getOperation()->replaceUsesOfWith(requantize_op, value); + // This requantization was defined as required for the result value, so + // there should be only one requant state. + return; + } + + // If this is an operand that requires requantization, then the value should + // only have one `DequantizeCastOp` user which produces the operand value. + if (!value.hasOneUse()) { + return; + } + auto dequant_op = dyn_cast_or_null( + value.use_begin().getUser()); + if (!dequant_op) { + return; + } + // It is possible that the dequant value is used by a op that doesn't require + // requant, so only overwrite the first if that is not the case. + const int num_uses = std::distance(dequant_op.getResult().use_begin(), + dequant_op.getResult().use_end()); + + // Whether to replace quantization params of the first dequantize op + // after the quantized value is produced. + // If there is a use other than the requantize states, then we can't clobber. + bool clobber_first = num_uses <= states.size(); + for (RequantizeState& state : states) { + Type expressed_type = QuantizedType::castToExpressedType(value.getType()); + if (!expressed_type) continue; + // The value needs to be requantized. A Quantize op will be created to use + // it as the operand and replace its uses. + const Type new_type = state.params.castFromExpressedType(expressed_type); + // This value isn't an expressed type (float), skip. + if (!new_type) continue; + + auto requantize_op = + builder_.create(loc, new_type, value); + + if (clobber_first) { + dequant_op.setOperand(requantize_op.getResult()); + // All ops requiring this value already use the result of dequant. 
+ clobber_first = false; + } else { + auto new_dequant_op = builder_.create( + loc, dequant_op.getResult().getType(), requantize_op.getResult()); + for (auto [op, operand_idx] : state.users) { + op->setOperand(operand_idx, new_dequant_op.getResult()); + } + } + } +} + +// A heuristic to get quantization parameters that satisfy the same-scale +// constraints: +// - If there are immutable states, +// - use the single input, or, +// - use the single output, or, +// - use the first one in the collection, +// - use the single input if it is ready, or, +// - use the single output if it is ready, or, +// - use the first ready one in the collection. +QuantizedType QuantizationDriver::GetQuantParamsForSameScaleConstraint( + Operation* op) { + // Two vectors to collect the non-empty operand and result states. + std::vector mutable_states, immutable_states; + for (int i = 0; i < op->getNumOperands(); ++i) { + QuantState& state = GetOperandQuantState(op, i); + if (state.immutable) { + immutable_states.push_back(&state); + } else if (!state.IsEmpty()) { + mutable_states.push_back(&state); + } + } + + const int immutable_operands_num = immutable_states.size(); + const int mutable_operands_num = mutable_states.size(); + // Use the operand's state if it is immutable and it is the only + // operand. + if (op->getNumOperands() == 1 && immutable_operands_num == 1) { + return immutable_states.front()->params; + } + + for (int i = 0; i < op->getNumResults(); ++i) { + QuantState& state = GetResultQuantState(op, i); + if (state.immutable) { + immutable_states.push_back(&state); + } else if (!state.IsEmpty()) { + mutable_states.push_back(&state); + } + } + + const int immutable_results_num = + immutable_states.size() - immutable_operands_num; + const int mutable_results_num = mutable_states.size() - mutable_operands_num; + // Use the result's state if it is immutable and it is the only result. + if (op->getNumResults() == 1 && immutable_results_num == 1) { + return immutable_states.back()->params; + } + + // Use the first immutable state to quantize the rest of the operands and results. + if (!immutable_states.empty()) return immutable_states.front()->params; + + // If there are no immutable states, use the operand's state if it is the + // only operand and has parameters propagated. + if (op->getNumOperands() == 1 && mutable_operands_num == 1) { + return mutable_states.front()->params; + } + + // If there are no immutable states, use the result's state if it is the + // only result and has parameters propagated. + if (op->getNumResults() == 1 && mutable_results_num == 1) { + return mutable_states.back()->params; + } + + // Use the first propagated state to quantize the rest of the operands and results. + if (!mutable_states.empty()) return mutable_states.front()->params; + + // No operands or results have parameters propagated; skip this node for now. + return {}; +} + +void QuantizationDriver::PreprocessConstantOps() { + fn_.walk([&](arith::ConstantOp cst) { + // Non-float tensors are not weights and do not require quantization. + const auto type = mlir::dyn_cast(cst.getType()); + if (!type || !mlir::isa(type.getElementType())) return; + + // Skip if the value is NaN or INF; + // otherwise an illegal scale/zero point would be calculated.
+ auto float_attr = mlir::dyn_cast(cst.getValueAttr()); + if (float_attr && (float_attr.getValues().empty() || + !float_attr.getValues()[0].isFinite())) { + return; + } + + const Value value = cst.getResult(); + builder_.setInsertionPoint(cst); + + // The following loop will change the value uses, thus we cache all the uses + // that need to be changed. + SmallVector> uses; + for (OpOperand& use : value.getUses()) { + uses.push_back({use.getOwner(), use.getOperandNumber()}); + } + for (const auto [user, operand_num] : uses) { + const std::unique_ptr spec = GetQuantSpec(user); + const std::unique_ptr scale_spec = + GetQuantScaleSpec(user); + const BiasParamsMap biases = spec->biases_params; + + // The quantization parameters of a `weight` shouldn't be determined by + // other values. So any constants which are not a bias, not an operand of an + // op with same-scale requirements, and haven't been quantized are + // weights. + if (!biases.contains(operand_num) && + !scale_spec->has_same_scale_requirement && + !dyn_cast(user)) { + // Needs to scan the content of weights to get the quantization + // parameters if there are no quantization parameters (FakeQuant ops). + // For this case, the weight will not be duplicated. + weights_.insert(cst); + if (spec->coeff_op_quant_dim.find(operand_num) != + spec->coeff_op_quant_dim.end()) { + optimized_weights_.insert( + {cst, spec->coeff_op_quant_dim[operand_num]}); + } + } else { + // This is a bias or an operand of an op with same-scale requirements, + // so the quantization parameters are propagated from or determined by + // other values. Duplicate this constant in case it is shared by + // different users. + if (uses.size() > 1) { + auto new_constant_op = + builder_.create(cst.getLoc(), cst.getValue()); + user->setOperand(operand_num, new_constant_op); + } + } + } + }); +} + +void QuantizationDriver::SetupAllStates() { + for (BlockArgument arg : fn_.getArguments()) { + args_.push_back(arg); + Value value = arg; + // If the argument is quantized, it should only have one user. + if (arg.hasOneUse()) { + Operation* user = value.use_begin().getUser(); + if (auto q = dyn_cast(user)) { + value = q.getResult(); + } + } + InitializeArgState(arg, value); + } + + fn_.walk([&](Operation* op) { + std::unique_ptr scale_spec = GetQuantScaleSpec(op); + if (!IsOpQuantizable(op) && !scale_spec->has_same_scale_requirement) { + return; + } + work_list_.push_back(op); + + for (int i = 0; i < op->getNumOperands(); ++i) { + Value operand = op->getOperand(i); + if (Operation* inst = operand.getDefiningOp()) { + // If the operand comes from a `quantfork::DequantizeCastOp`, we use + // the quantized input of this `quantfork::DequantizeCastOp` to set the + // state. + if (auto dq = dyn_cast(inst)) { + operand = dq.getArg(); + } + } + InitializeOperandState(op, i, operand); + } + + for (int i = 0; i < op->getNumResults(); ++i) { + Value result = op->getResult(i); + // If the result has been quantized, it should only be used by a + // `quantfork::QuantizeCastOp`. For this case, we use the quantized + // result to create the state and mark it immutable.
+ if (result.hasOneUse()) { + Operation* user = result.use_begin().getUser(); + if (auto q = dyn_cast(user)) { + result = q.getResult(); + } + } + InitializeResultState(op, i, result); + } + }); +} + +arith::ConstantOp QuantizationDriver::DuplicateConstantOpIfNeeded( + arith::ConstantOp op, Operation* target_op, const int operand_index) { + if (op.getResult().hasOneUse()) { + return op; + } + OpBuilder builder(op->getContext()); + builder.setInsertionPointAfter(op); + arith::ConstantOp new_op = cast(builder.clone(*op)); + target_op->getOpOperand(operand_index).set(new_op.getResult()); + InitializeOperandState(target_op, operand_index, new_op.getResult()); + InitializeResultState(new_op, 0, new_op.getResult()); + return new_op; +} + +bool QuantizationDriver::ShouldCheckBiasScale( + Operation* op, const int bias_index, ArrayRef input_indices, + const QuantizedType quantized_type, int& input_index, int& filter_index) { + // For now, restrict scale adjustment to ops with affine quantized weights, + // and having weights and biases as constants. This currently only applies to + // FC and Conv* ops. Restriction for the weight can be relaxed if there are + // needs for adjusting scale of variable weights. + auto affine_op = dyn_cast(op); + auto bias_op = op->getOperand(bias_index).getDefiningOp(); + if (!affine_op || !bias_op || input_indices.size() != 2) return false; + if (!mlir::isa(bias_op.getValue())) return false; + filter_index = affine_op.GetAffineOperandIndex(); + if (!op->getOperand(filter_index).getDefiningOp()) { + return false; + } + if (filter_index == input_indices[0]) { + input_index = input_indices[1]; + } else if (filter_index == input_indices[1]) { + input_index = input_indices[0]; + } else { + return false; + } + + const QuantState& input_state = GetOperandQuantState(op, input_index); + const QuantState& filter_state = GetOperandQuantState(op, filter_index); + // If quantization parameter for the filter is fixed, should return it as-is. + // Only checks ops with 8-bit input and weights, and 32-bit biases. + return input_state.params.getStorageTypeIntegralWidth() == 8 && + filter_state.params.getStorageTypeIntegralWidth() == 8 && + quantized_type.getStorageTypeIntegralWidth() == 32; +} + +bool QuantizationDriver::SetBiasParamsWithAdjustments( + Operation* op, const int bias_index, ArrayRef input_indices, + const QuantizedType params) { + bool changed = false; + + int input_index; + int filter_index; + if (!ShouldCheckBiasScale(op, bias_index, input_indices, params, input_index, + filter_index)) { + return SetOperandParams(op, bias_index, params); + } + + QuantState input_state = GetOperandQuantState(op, input_index); + QuantState filter_state = GetOperandQuantState(op, filter_index); + auto bias_op = op->getOperand(bias_index).getDefiningOp(); + const double input_scale = + mlir::cast(input_state.params).getScale(); + + auto bias_values = mlir::cast(bias_op.getValue()); + // Restrict maximum absolute value of bias within INT_MAX / 2, to make some + // room for accumulator. 
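+  // Sketch of the adjustment below: if max(|bias|) / bias_scale exceeds
+  // kBiasMax, the bias scale is raised to max(|bias|) / kBiasMax and the
+  // filter scale is rewritten as new_bias_scale / input_scale, so that
+  // bias_scale == input_scale * filter_scale still holds for the accumulator.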
+ if (auto bias_quantized_type = mlir::dyn_cast(params); + bias_quantized_type != nullptr) { + double bias_half_range = 0.0f; + for (auto bias : bias_values.getValues()) { + if (bias_half_range < std::abs(bias.convertToFloat())) { + bias_half_range = std::abs(bias.convertToFloat()); + } + } + if (bias_half_range / bias_quantized_type.getScale() < kBiasMax) { + return SetOperandParams(op, bias_index, params); + } + const double new_bias_scale = + static_cast(bias_half_range) / kBiasMax; + + changed |= SetOperandParams( + op, bias_index, + UniformQuantizedType::getChecked( + bias_op->getLoc(), params.getFlags(), params.getStorageType(), + params.getExpressedType(), new_bias_scale, 0, + params.getStorageTypeMin(), params.getStorageTypeMax())); + arith::ConstantOp filter_op = DuplicateConstantOpIfNeeded( + op->getOperand(filter_index).getDefiningOp(), op, + filter_index); + if (!filter_op) { + return SetOperandParams(op, bias_index, params); + } + + const auto filter_quantized_type = + mlir::cast(filter_state.params); + changed |= SetOperandParams( + op, filter_index, + UniformQuantizedType::getChecked( + filter_op->getLoc(), filter_quantized_type.getFlags(), + filter_quantized_type.getStorageType(), + filter_quantized_type.getExpressedType(), + new_bias_scale / input_scale, 0, + filter_quantized_type.getStorageTypeMin(), + filter_quantized_type.getStorageTypeMax()), + /*override=*/true); + } else if (auto bias_quantized_type = + mlir::dyn_cast(params); + bias_quantized_type != nullptr) { + const auto filter_quantized_type = + mlir::cast(filter_state.params); + std::vector new_bias_scales = bias_quantized_type.getScales().vec(); + std::vector new_filter_scales = + filter_quantized_type.getScales().vec(); + + bool needs_adjustment = false; + for (int i = 0; i < bias_quantized_type.getScales().size(); ++i) { + const float abs_bias = std::abs(bias_values.getValues()[i]); + if (abs_bias / new_bias_scales[i] > kBiasMax) { + new_bias_scales[i] = static_cast(abs_bias) / kBiasMax; + new_filter_scales[i] = new_bias_scales[i] / input_scale; + needs_adjustment = true; + } + } + if (!needs_adjustment) { + return SetOperandParams(op, bias_index, params); + } + changed |= SetOperandParams( + op, bias_index, + quant::UniformQuantizedPerAxisType::getChecked( + bias_op->getLoc(), params.getFlags(), params.getStorageType(), + params.getExpressedType(), new_bias_scales, + bias_quantized_type.getZeroPoints(), + bias_quantized_type.getQuantizedDimension(), + params.getStorageTypeMin(), params.getStorageTypeMax())); + + arith::ConstantOp filter_op = DuplicateConstantOpIfNeeded( + op->getOperand(filter_index).getDefiningOp(), op, + filter_index); + changed |= SetOperandParams( + op, filter_index, + quant::UniformQuantizedPerAxisType::getChecked( + filter_op->getLoc(), filter_quantized_type.getFlags(), + filter_quantized_type.getStorageType(), + filter_quantized_type.getExpressedType(), new_filter_scales, + filter_quantized_type.getZeroPoints(), + filter_quantized_type.getQuantizedDimension(), + filter_quantized_type.getStorageTypeMin(), + filter_quantized_type.getStorageTypeMax()), + /*override=*/true); + } + return changed; +} + +// This method scans the operations in the function to setup the initial +// states for quantization parameter propagation. +// TODO: b/323478683 - This algorithm assumes there are only one pair of +// `quantfork::QuantizeCastOp` and `quantfork::DequantizeCastOp` ops between two +// quantizable ops. A sanity check should be applied. 
+void QuantizationDriver::Initialize() { + // Duplicate the bias constant, so the states can be setup correctly. + // TODO: b/323478683 - Function definition should also be duplicated if there + // are multiple call sites. + PreprocessConstantOps(); + + // Setup all the internal states. + SetupAllStates(); +} + +// Propagates the quantization parameters to the operands, results, and biases. +// TODO: b/323478683 - Do not use while loop to handle this logic. +bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { + // TODO: b/323478683 - Use a typed indicator instead of a bool value. + bool changed = false; + while (!work_list_.empty()) { + Operation* op = work_list_.back(); + work_list_.pop_back(); + + // This op has been quantized, so we should not consider it again. + if (quantized_.contains(op)) continue; + quantized_.insert(op); + + if (auto constant_op = dyn_cast(op); constant_op) { + // If the workflow requires inferring ranges from the content + // (post-training quantization) and it is weight (filter) and hasn't + // been quantized, we infer the quantization parameters from the content. + if (infer_tensor_range_ && IsWeight(constant_op) && !IsQuantized(op)) { + // The quantization parameters are determined by the content of the + // constant. + changed |= SetConstantResultParams(op); + } + continue; + } + + std::unique_ptr scale_spec = GetQuantScaleSpec(op); + + if (scale_spec->has_same_scale_requirement) { + const QuantizedType params = GetQuantParamsForSameScaleConstraint(op); + // The quantization parameters haven't been propagated to any operands + // or results. Skip this node for now. + if (!params) { + quantized_.erase(op); + continue; + } + + // If this is a QDQ conversion only, the op could have a same-scale + // requirement for the floating point kernel but allow per-axis + // quantization for the quantized kernel. If the quantized dimension + // changes, the following logic no longer works as the same `params` + // shouldn't be used for both input and output quantization params. + // E.g. During TransposeOp's quantization propagation in + // PrepareQuantize, if the quantization is per-axis and the + // QuantizedDimension is transposed, then the output q-dq params must + // reflect the new QuantizedDimension. So, check and skip the + // propagation if any of the operands has a per-axis quantized type param + // and `RequiredSameQuantizedAxes` set to false. + // Currently, these lines of code are only applicable to TFL_TransposeOp + // and TFL_ReshapeOp. And the output q-dq propagation for this Op is + // performed in `PropagateTransposedPerAxisQuantDim` and + // `PropagateReshapedPerAxisQuantDim` respectively. + if (is_qdq_conversion_ && + !scale_spec->required_same_quantized_axes_func()) { + if (HasPerAxisQuantizedOperand(op)) continue; + } + + // Use the final state to set all the operands' parameters. + for (int i = 0; i < op->getNumOperands(); ++i) { + if (auto type = + mlir::dyn_cast(op->getOperand(i).getType())) { + // Without this check, it will accidentally propagate the quantization + // information by the shared non-float tensors. + if (mlir::isa(type.getElementType())) + changed |= SetOperandParams(op, i, params); + } + } + + // Use the final state to set all the results' parameters. + for (int i = 0; i < op->getNumResults(); ++i) + if (auto type = mlir::dyn_cast(op->getResult(i).getType()); + type != nullptr) { + // Without this check, it will accidentally propagate the quantization + // information by the shared non-float-tensors. 
+ if (mlir::isa(type.getElementType())) + changed |= SetResultParams(op, i, params); + } + } + + // If the model already contains immutable QDQs, require upstream to + // explicitly fix output range instead. + if (scale_spec->has_fixed_output_range && infer_tensor_range_ && + !is_qdq_conversion_) { + // Infer ranges from the activation ops. This is usually required for + // the post-training quantization workflow. + // TODO: b/323478683 - Different result can have different fixed range. + const QuantizedType params = + scale_spec->fixed_output_range_func(is_signed_, bit_width_); + for (auto i = 0; i < op->getNumResults(); ++i) { + // The range is null if the result has been quantized. + if (params) { + changed |= SetResultParams(op, i, params); + } + } + } + + const std::unique_ptr spec = GetQuantSpec(op); + for (const auto& [bias_operand_idx, non_bias_params] : + spec->biases_params) { + const auto& [non_bias_operand_indices, accumulator_scale_func] = + non_bias_params; + const QuantizedType params = + GetBiasParams(op, bias_operand_idx, non_bias_operand_indices, + accumulator_scale_func); + if (!params) { + quantized_.erase(op); + continue; + } + changed |= SetBiasParamsWithAdjustments(op, bias_operand_idx, + non_bias_operand_indices, params); + } + } + + return changed; +} + +// Finalizes the arguments and result states in the function. +void QuantizationDriver::Finalize() { + for (BlockArgument arg : args_) { + const QuantState& state = GetArgQuantState(arg); + RequantizeStates& requantizes = GetArgRequantizeStates(arg); + if (state.IsEmpty() || (state.immutable && requantizes.empty())) { + continue; + } + + if (!state.immutable) { + QuantizeArg(arg, state.params); + } + + if (!requantizes.empty()) { + RequantizeArg(arg, requantizes); + } + } + + for (const auto& [op_with_result_idx, quant_state_idx] : result_states_) { + const auto [op, result_idx] = op_with_result_idx; + const QuantState& state = GetResultQuantState(op, result_idx); + RequantizeStates& requantizes = GetResultRequantizeStates(op, result_idx); + if (state.IsEmpty() || (state.immutable && requantizes.empty())) { + continue; + } + + if (!state.immutable) { + QuantizeOpResult(op, result_idx, state.params); + } + + if (!requantizes.empty()) { + RequantizeOpResult(op, result_idx, requantizes); + } + } +} + +// Runs quantization in following steps: +// 1. Scans the operations in the function to setup the initial +// states for quantization parameter propagation. +// 2. Propagates the quantization parameters to the operands, results, and +// biases. +// 3. Finalizes the arguments and result states in the function. 
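+//
+// A minimal sketch of the intended call pattern (with illustrative argument
+// values, mirroring the ApplyQuantizationParamsPropagation overloads defined
+// below):
+//
+//   QuantizationDriver(func, /*is_signed=*/true, /*bit_width=*/8,
+//                      disable_per_channel, op_quant_spec_getter,
+//                      GetDefaultQuantScaleSpec, infer_tensor_ranges,
+//                      legacy_float_scale, is_qdq_conversion)
+//       .Run();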
+void QuantizationDriver::Run() { + Initialize(); + if (PropagateParamsAndReturnIfChanged()) { + Finalize(); + } +} + +void ApplyQuantizationParamsPropagation( + const func::FuncOp func, const bool is_signed, const int bit_width, + const bool disable_per_channel, + const OpQuantSpecGetter op_quant_spec_getter, + const bool infer_tensor_ranges, const bool legacy_float_scale, + const bool is_qdq_conversion) { + ApplyQuantizationParamsPropagation( + func, is_signed, bit_width, disable_per_channel, op_quant_spec_getter, + GetDefaultQuantScaleSpec, infer_tensor_ranges, legacy_float_scale, + is_qdq_conversion); +} + +void ApplyQuantizationParamsPropagation( + const func::FuncOp func, const bool is_signed, const int bit_width, + const bool disable_per_channel, + const OpQuantSpecGetter op_quant_spec_getter, + const OpQuantScaleSpecGetter op_quant_scale_spec_getter, + const bool infer_tensor_ranges, const bool legacy_float_scale, + const bool is_qdq_conversion) { + QuantizationDriver(func, is_signed, bit_width, disable_per_channel, + op_quant_spec_getter, op_quant_scale_spec_getter, + infer_tensor_ranges, legacy_float_scale, is_qdq_conversion) + .Run(); +} + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_driver.h b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_driver.h new file mode 100644 index 000000000000..18d156ec8aa3 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_driver.h @@ -0,0 +1,387 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_DRIVER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_DRIVER_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" + +namespace mlir { +namespace TFL { + +// The state for each op result during the quantization parameters propagation. +struct QuantState { + // Quantization parameters propagated to an op result. 
+  QuantizedType params;
+  // A flag indicating that this state (the params) shouldn't be changed after
+  // it is initialized. This flag will be set to true if the quantization
+  // parameters are from quantization-aware training.
+  const bool immutable;
+
+  bool IsEmpty() const { return params == nullptr; }
+};
+
+// The state for rescaling the propagated quantization parameters. This can be
+// on the input side to satisfy the constraint of the previous operation, or on
+// the output side to satisfy the constraint of the next operation.
+struct RequantizeState {
+  // Sometimes, we have to "requantize" the quantization result to satisfy all
+  // the constraints. The "requantize" can happen either on the input or the
+  // output of the quantization result.
+  enum RequantizePosition {
+    NO_REQUANTIZE,
+    ON_INPUT,
+    ON_OUTPUT
+  } pos = NO_REQUANTIZE;
+
+  // Quantization parameters that will be used to add the requantize ops.
+  QuantizedType params;
+
+  // Avoid clobbering all uses of the value; limit to just these ops.
+  SmallVector<std::pair<Operation*, int>> users;
+};
+
+using RequantizeStates = SmallVector<RequantizeState>;
+
+// This is a worklist-driven driver for propagating quantization parameters
+// across operations.
+//
+// The initial quantization parameters are extracted from the quantized types
+// between adjacent `quantfork::QuantizeCastOp` and
+// `quantfork::DequantizeCastOp`s. All these initial parameters are marked as
+// immutable because they come from quantization-aware training.
+//
+// The algorithm traverses each op and sets the quantization parameters of its
+// operands and results according to its quantization specification, and then
+// adds the operands and results to the worklist. If there are any conflicts
+// (for example, quantization parameters propagated from a previous iteration),
+// this process either stops, if the existing parameters are immutable, or adds
+// a `requantize` op to resolve the conflicts.
+//
+// After the algorithm converges, pairs of `quantfork::QuantizeCastOp` and
+// `quantfork::DequantizeCastOp` are inserted at the right positions to
+// materialize the propagation and requantize results.
+//
+class QuantizationDriver {
+ public:
+  // Type alias of int used to access `states_`.
+  using QuantStateIndex = int;
+
+  // (op, operand index) pair.
+  using OpWithOperandIndex = std::pair<Operation*, int>;
+
+  // (op, result index) pair.
+  using OpWithResultIndex = std::pair<Operation*, int>;
+
+  explicit QuantizationDriver(func::FuncOp func_op, const bool is_signed,
+                              const int bit_width,
+                              const bool disable_per_channel,
+                              OpQuantSpecGetter op_quant_spec_getter,
+                              OpQuantScaleSpecGetter op_quant_scale_spec_getter,
+                              const bool infer_tensor_range,
+                              const bool legacy_float_scale = false,
+                              const bool is_qdq_conversion = false)
+      : fn_(func_op),
+        builder_(func_op.getBody()),
+        is_signed_(is_signed),
+        bit_width_(bit_width),
+        disable_per_channel_(disable_per_channel),
+        op_quant_spec_getter_(op_quant_spec_getter),
+        op_quant_scale_spec_getter_(op_quant_scale_spec_getter),
+        infer_tensor_range_(infer_tensor_range),
+        legacy_float_scale_(legacy_float_scale),
+        is_qdq_conversion_(is_qdq_conversion) {}
+
+  // The entry point of the quantization parameters propagation.
+  void Run();
+
+  // Sets up the states for all the op results in the function.
+  void Initialize();
+
+  // Propagates the quantization parameters across all the ops.
+  bool PropagateParamsAndReturnIfChanged();
+
+  // Inserts the Quantize and Dequantize ops according to the propagation
+  // result.
+ void Finalize(); + + SmallVector GetArgs() { return args_; } + + llvm::DenseMap, int> GetResultStates() { + return result_states_; + } + + DenseMap result_states_; + + // Returns the state of the block argument. + QuantState& GetArgQuantState(BlockArgument arg) { + return states_[arg_states_[arg]]; + } + + // Returns the state of the index-th result of the op. + QuantState& GetResultQuantState(Operation* op, const int index) { + return states_[result_states_[{op, index}]]; + } + + private: + // Duplicates the constant op if it has multiple uses, and replaces + // target_op->operand[operand_index] with the newly created op. This also + // replaces corresponsing quantization states. + arith::ConstantOp DuplicateConstantOpIfNeeded(arith::ConstantOp op, + Operation* target_op, + int operand_index); + + // Adjusts bias scale that is derived from other scales (fc, conv ops) to + // prevent overflow of quantized bias values. This also changes quantization + // state of other inputs when needed. + bool SetBiasParamsWithAdjustments(Operation* op, int bias_index, + ArrayRef input_indices, + QuantizedType params); + + // Checks preconditions to adjust bias scale. + bool ShouldCheckBiasScale(Operation* op, int bias_index, + ArrayRef input_indices, + QuantizedType quantized_type, int& input_index, + int& filter_index); + + // Preprocesses the constants by doing the following: + // - Duplicates constants if it is used by multiple ops. For example, if a + // constant is used by multiple ops as a bias, duplicate constants and + // let each op assign its own quantization parameter for bias. + // - Adds all the non-bias constants (weights) to a set for looking up + // later. + // - Adds all per-channel weights to a set for looking up later. + void PreprocessConstantOps(); + + // Sets up all the data structures for quantization propagation. + void SetupAllStates(); + + // Returns Whether the constant is a weight, which shouldn't be shared by + // different ops. + bool IsWeight(Operation* cst) { return llvm::is_contained(weights_, cst); } + + // Returns all the related quantization constraints of the op. + std::unique_ptr GetQuantSpec(Operation* op); + std::unique_ptr GetQuantScaleSpec(Operation* op); + + // Returns whether quantization parameters have been propagated to the results + // of this op. + bool IsQuantized(Operation* op); + + // Adds all the users of index-th result of op to the work list. + void AddUserToList(Operation* op, const int index) { + for (Operation* user : op->getResult(index).getUsers()) { + work_list_.push_back(user); + } + } + + // Adds the defining op of index-th operand of op to the work list. + void AddOperandToList(Operation* op, const int index) { + if (Operation* operand_op = op->getOperand(index).getDefiningOp(); + operand_op != nullptr) { + work_list_.push_back(operand_op); + } + } + + // Returns the quantization params for the bias input from the non-bias + // operands which have their indexes in the `non_biases` vector. The returned + // parameters are calculated by `func`. + QuantizedType GetBiasParams(Operation* op, int bias_index, + ArrayRef non_bias_operand_indices, + AccumulatorScaleFunc func); + + // Sets the quantization parameters of the result to `quantized_type`. If + // any quantization parameters have been propagated, a requantize will + // happen on the input of propagated quantization. Returns `true` if internal + // state has been modified. 
+ bool SetResultParams(Operation* op, int result_index, + QuantizedType quantized_type); + + // Sets the quantization parameters of the operand to `quantized_type`. If any + // quantization parameters have been propagated, a `requantize` will happen on + // the output of propagated quantization. When `override` is set, quantization + // state of the value is replaced instead of adding requantization. Returns + // `true` if internal state has been modified. + bool SetOperandParams(Operation* op, int operand_index, + QuantizedType quantized_type, bool override = false); + + // Sets the quantization parameters of the constant result according to its + // content. + bool SetConstantResultParams(Operation* op); + + // Inserts the Quantize and Dequantize ops after `op`'s `index`-th result. The + // quantized element type for the result is `quantized_type`. + void QuantizeOpResult(Operation* op, int result_index, + QuantizedType quantized_type); + + // Inserts the Quantize and Dequantize ops after `arg`. The quantized element + // type for `arg` is `quantized_type`. + void QuantizeArg(BlockArgument arg, QuantizedType quantized_type); + + // Inserts the Quantize and Dequantize ops (i.e. QDQ) after `value`. The + // quantized element type for `value` is `quantized_type`. + void QuantizeValue(Value value, QuantizedType quantized_type, Location loc); + + // Inserts the Quantize ops for requantizing the index-th result of the op. + void RequantizeOpResult(Operation* op, int result_index, + RequantizeStates& states); + + // Inserts the Quantize ops for requantizing a block argument. + void RequantizeArg(BlockArgument arg, RequantizeStates& states); + + // Inserts the Quantize and Dequantize ops to quantize the value and returns + // the Quantize op. + void RequantizeValue(Value value, RequantizeStates& states, Location loc); + + // Returns the quantization parameter satisfies the same scale + // constraints for the op. Returns an empty option if this quantization + // parameter doesn't exist. + QuantizedType GetQuantParamsForSameScaleConstraint(Operation* op); + + // Returns the state of the index-th operand of the op. + QuantState& GetOperandQuantState(Operation* op, const int index) { + return states_[operand_states_[{op, index}]]; + } + + // Returns the states of the index-th operand of the op. + RequantizeStates& GetOperandRequantizeStates(Operation* op, const int index) { + return rescale_states_[operand_states_[{op, index}]]; + } + + // Returns the states of the index-th result of the op. + RequantizeStates& GetResultRequantizeStates(Operation* op, const int index) { + return rescale_states_[result_states_[{op, index}]]; + } + + // Returns the states of the arg. + RequantizeStates& GetArgRequantizeStates(BlockArgument arg) { + return rescale_states_[arg_states_[arg]]; + } + + // Sets the state of an argument. If this value is cached, uses the cached + // result without creating new entry in the state vector. Otherwise, allocate + // a new entry in the state vector. + void InitializeArgState(BlockArgument arg, Value arg_value); + + // Sets the state of the index-th operand of the op. If this operand is + // cached, uses the cached result without creating new entry in the state + // vector. Otherwise, allocate a new entry in the state vector. + void InitializeOperandState(Operation* op, int index, Value value); + + // Sets the state of the index-th result of the op. If this result is cached, + // uses the cached result without creating new entry in the state vector. 
+  // Otherwise, allocate a new entry in the state vector.
+  void InitializeResultState(Operation* op, int index, Value value);
+
+  func::FuncOp fn_;
+  OpBuilder builder_;
+  const bool is_signed_;
+  const int bit_width_;
+  const bool disable_per_channel_;
+
+  // We should distinguish weights and bias constants. Biases are specified by
+  // the quantization spec or are the operands of ops with same scale spec. The
+  // rest are weights.
+  DenseSet<Operation*> weights_;
+
+  // The weights require narrow_range quantization. This map collects all the
+  // weight operands defined by the op quant spec. The value of each entry is
+  // the quantization dimension. If it is positive, per-channel quantization is
+  // required.
+  DenseMap<Operation*, int> optimized_weights_;
+
+  // All the ops to which the quantization parameters need to be propagated.
+  std::vector<Operation*> work_list_;
+  absl::flat_hash_set<Operation*> quantized_;
+
+  // The vector contains all the quantization parameters propagated from the
+  // defining operations of the value, or from the quantization-aware training.
+  std::vector<QuantState> states_;
+
+  // The map contains all the quantization parameters which are required to
+  // satisfy the same operands and results constraint. The keys of this map are
+  // the values from `operand_states_` and `result_states_`.
+  absl::flat_hash_map<QuantStateIndex, RequantizeStates> rescale_states_;
+
+  // Maps of indexes into the propagation state vector from the ops' operands,
+  // results, and arguments.
+  DenseMap<OpWithOperandIndex, QuantStateIndex> operand_states_;
+  DenseMap<BlockArgument, QuantStateIndex> arg_states_;
+  DenseMap<Value, QuantStateIndex> value_to_state_;
+
+  // This vector preserves the argument order, so the newly inserted quantized
+  // ops for the arguments are deterministically ordered.
+  SmallVector<BlockArgument, 4> args_;
+
+  OpQuantSpecGetter op_quant_spec_getter_;
+  OpQuantScaleSpecGetter op_quant_scale_spec_getter_;
+
+  // Infer output ranges for activation ops and constants. This is usually
+  // required for post-training quantization.
+  const bool infer_tensor_range_;
+
+  // Calculate scales in float instead of double, so that the scales and
+  // quantized values are exactly the same as with the TOCO quantizer.
+  const bool legacy_float_scale_;
+
+  // If true, the model is a floating point graph with QDQ ops to be eliminated
+  // and fused into quantized kernels.
+  const bool is_qdq_conversion_;
+};
+
+// Propagates quantization parameters across ops in this function and satisfies
+// the quantization specification of the ops. This method assumes the initial
+// quantization parameters are stored as adjacent quantize and dequantize ops
+// and the propagation results are materialized by inserting pairs of quantize
+// and dequantize ops into this function. Set `disable_per_channel` to true to
+// not use per-channel quantization even if the op supports it.
+// Set `infer_tensor_range` to true to infer quantization parameters from
+// the activation ops and weight constants. This is only used for post-training
+// quantization.
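+//
+// Example invocation (a sketch; `GetOpQuantSpec` stands in for whatever
+// `OpQuantSpecGetter` the caller provides):
+//
+//   ApplyQuantizationParamsPropagation(
+//       func, /*is_signed=*/true, /*bit_width=*/8,
+//       /*disable_per_channel=*/false, GetOpQuantSpec,
+//       /*infer_tensor_ranges=*/true, /*legacy_float_scale=*/false,
+//       /*is_qdq_conversion=*/false);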
+void ApplyQuantizationParamsPropagation(func::FuncOp func, bool is_signed, + int bit_width, bool disable_per_channel, + OpQuantSpecGetter op_quant_spec_getter, + bool infer_tensor_ranges, + bool legacy_float_scale, + bool is_qdq_conversion); + +void ApplyQuantizationParamsPropagation( + func::FuncOp func, bool is_signed, int bit_width, bool disable_per_channel, + OpQuantSpecGetter op_quant_spec_getter, + OpQuantScaleSpecGetter op_quant_scale_spec_getter, bool infer_tensor_ranges, + bool legacy_float_scale, bool is_qdq_conversion); + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_DRIVER_H_ diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_driver_test.cc b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_driver_test.cc new file mode 100644 index 000000000000..59ca182bd418 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_driver_test.cc @@ -0,0 +1,169 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_driver.h" + +#include +#include +#include +#include + +#include +#include +#include "absl/strings/string_view.h" +#include "llvm/ADT/DenseMap.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/common/func.h" +#include "tensorflow/compiler/mlir/quantization/common/test_base.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir::TFL { +namespace { + +using ApplyQuantizationParamsPropagationTest = + mlir::quant::QuantizationTestBase; +using ::testing::IsEmpty; +using ::testing::Not; + +constexpr absl::string_view kModuleTFLite = R"mlir( + module { + func.func @main(%arg0: tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> attributes {_from_xla_call_module} { + %cst_0 = arith.constant dense<1.0> : tensor<3x1x1x3xf32> + %cst_1 = arith.constant dense<2.0> : tensor<3xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst_0, %cst_1) 
<{Sout = [#tf_type.shape<1x4x4x3>], module = "", version = 9 : i64}> {_entry_function = @composite_fn_1, _stablehlo_version = "1.0.0", _original_entry_function = "composite_fn_1", _tfl_quant_trait = "fully_quantizable"} : (tensor<1x4x4x3xf32>, tensor<3x1x1x3xf32>, tensor<3xf32>) -> tensor<1x4x4x3xf32>
+      %1 = "tf.XlaCallModule"(%0, %cst_0, %cst_1) <{Sout = [#tf_type.shape<1x4x4x3>], module = "", version = 9 : i64}> {_entry_function = @composite_fn_2, _stablehlo_version = "1.0.0", _original_entry_function = "composite_fn_2", _tfl_quant_trait = "fully_quantizable"} : (tensor<1x4x4x3xf32>, tensor<3x1x1x3xf32>, tensor<3xf32>) -> tensor<1x4x4x3xf32>
+      return %1 : tensor<1x4x4x3xf32>
+    }
+    func.func private @composite_fn_1(%arg0: tensor<1x4x4x3xf32>, %arg1: tensor<3x1x1x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x4x4x3xf32> attributes {tf_quant.composite_function} {
+      %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x4x4x3xf32>, tensor<3x1x1x3xf32>, tensor<3xf32>) -> tensor<1x4x4x3xf32>
+      return %0 : tensor<1x4x4x3xf32>
+    }
+    func.func private @composite_fn_2(%arg0: tensor<1x4x4x3xf32>, %arg1: tensor<3x1x1x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x4x4x3xf32> attributes {tf_quant.composite_function} {
+      %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x4x4x3xf32>, tensor<3x1x1x3xf32>, tensor<3xf32>) -> tensor<1x4x4x3xf32>
+      return %0 : tensor<1x4x4x3xf32>
+    }
+  }
+)mlir";
+
+// TODO: b/323478683 - Directly use types rather than creating a `unique_ptr`.
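+// Builds the quantization spec used by the tests in this file: operand 1 (the
+// convolution filter) is marked for per-channel quantization along dimension
+// 3, and operand 2 (the bias) derives its parameters from operands 0 and 1
+// through `GetUniformQuantizedTypeForBias`.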
+std::unique_ptr GetOpQuantSpec( + const mlir::Operation* op, + bool disable_per_channel_for_dense_layers = false) { + auto spec = std::make_unique(); + spec->coeff_op_quant_dim[1] = 3; + spec->biases_params[2] = {{0, 1}, GetUniformQuantizedTypeForBias}; + for (const auto& [key, value] : spec->coeff_op_quant_dim) { + spec->quantizable_operands.insert(key); + } + return spec; +} + +TEST_F(ApplyQuantizationParamsPropagationTest, + ConstsUsedMultipleTimesAreDuplicated) { + const OwningOpRef module_op_ref = + mlir::quant::QuantizationTestBase::ParseModuleOpString(kModuleTFLite); + func::FuncOp main_fn = mlir::quant::FindMainFuncOp(*module_op_ref); + + auto op_quant_spec_getter = [&](mlir::Operation* op) { + return GetOpQuantSpec(op, /*disable_per_channel_for_dense_layers=*/false); + }; + QuantizationDriver quantization_driver( + main_fn, /*is_signed=*/true, /*bit_width=*/8, + /*disable_per_channel=*/false, op_quant_spec_getter, + GetDefaultQuantScaleSpec, + /*infer_tensor_range=*/true, /*legacy_float_scale=*/false, + /*is_qdq_conversion=*/false); + + quantization_driver.Initialize(); + + int64_t num_constant_op = 0; + main_fn.walk([&](arith::ConstantOp cst) { ++num_constant_op; }); + EXPECT_EQ(num_constant_op, 4); +} + +TEST_F(ApplyQuantizationParamsPropagationTest, + PropagateParamsCreatesQuantState) { + const OwningOpRef module_op_ref = + ParseModuleOpString(kModuleTFLite); + func::FuncOp main_fn = mlir::quant::FindMainFuncOp(*module_op_ref); + + auto op_quant_spec_getter = [&](mlir::Operation* op) { + return GetOpQuantSpec(op, /*disable_per_channel_for_dense_layers=*/false); + }; + QuantizationDriver quantization_driver( + main_fn, /*is_signed=*/true, /*bit_width=*/8, + /*disable_per_channel=*/false, op_quant_spec_getter, + GetDefaultQuantScaleSpec, + /*infer_tensor_range=*/true, /*legacy_float_scale=*/false, + /*is_qdq_conversion=*/false); + + quantization_driver.Initialize(); + ASSERT_TRUE(quantization_driver.PropagateParamsAndReturnIfChanged()); + EXPECT_THAT(quantization_driver.GetArgs(), Not(IsEmpty())); + + for (const auto& arg : quantization_driver.GetArgs()) { + const QuantState& state = quantization_driver.GetArgQuantState(arg); + EXPECT_TRUE(isa(state.params)); + } + for (const auto& result : quantization_driver.GetResultStates()) { + mlir::Operation* op = result.first.first; + const int res_index = result.first.second; + const QuantState state = + quantization_driver.GetResultQuantState(op, res_index); + EXPECT_TRUE(isa(state.params)); + } +} + +TEST_F(ApplyQuantizationParamsPropagationTest, FinalizeInsertsQDQOps) { + const OwningOpRef module_op_ref = + ParseModuleOpString(kModuleTFLite); + func::FuncOp main_fn = mlir::quant::FindMainFuncOp(*module_op_ref); + + auto op_quant_spec_getter = [&](mlir::Operation* op) { + return GetOpQuantSpec(op, /*disable_per_channel_for_dense_layers=*/false); + }; + ApplyQuantizationParamsPropagation( + main_fn, /*is_signed=*/true, /*bit_width=*/8, + /*disable_per_channel=*/false, op_quant_spec_getter, + /*infer_tensor_ranges=*/true, /*legacy_float_scale=*/false, + /*is_qdq_conversion=*/false); + mlir::Operation* xla_call_module_op = + quant::FindOperationOfType(main_fn); + mlir::Operation* filter_dcast_op = + xla_call_module_op->getOperand(1).getDefiningOp(); + mlir::Operation* filter_qcast_op = + filter_dcast_op->getOperand(0).getDefiningOp(); + ASSERT_NE(filter_qcast_op, nullptr); + EXPECT_TRUE(isa(filter_qcast_op)); + EXPECT_TRUE(isa(filter_dcast_op)); + EXPECT_TRUE(isa( + mlir::cast(filter_qcast_op->getResult(0).getType()) + 
.getElementType())); +} + +} // namespace +} // namespace mlir::TFL diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_traits.h b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_traits.h new file mode 100644 index 000000000000..332682eb6199 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_traits.h @@ -0,0 +1,152 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the op traits used in the MLIR TensorFlow Lite dialect. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_TRAITS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_TRAITS_H_ + +#include +#include +#include + +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +using QuantizedType = mlir::quant::QuantizedType; +using UniformQuantizedType = mlir::quant::UniformQuantizedType; + +namespace mlir { +namespace TFL { +// Verifies that the op satisfies the same operands and results scales +// constraints. Note that this constraint can only be applied on some +// storage types of the op. +LogicalResult VerifySameScales(Operation* op); +} // namespace TFL + +// This includes the interface class definition. It couldn't be in a namespace +// because the table gen doesn't emit the namespace when it is used. +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_interface.h.inc" + +namespace OpTrait { +namespace TFL { + +// The base class that all the quantization related OpTrait implements. +template class TraitType> +struct QuantizationSpecTraitBase : public TraitBase { + static bool IsBias(int index) { return false; } + static bool IsQuantizable() { return true; } +}; + +// This class provides the API for ops that has a fixed output value range. +// This is used as a trait like this: +// +// class SoftmaxOp +// : public Op::Impl> { +// +// TODO(fengliuai): create a better way to express floating point scale in the +// template argument list. 
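+//
+// Spelled out with concrete template arguments (illustrative values only; they
+// correspond to a signed 8-bit output fixed to scale 1/256 and zero point
+// -128):
+//
+//   class SoftmaxOp
+//       : public Op<SoftmaxOp,
+//                   OpTrait::TFL::FixedResultUniformScale<
+//                       8, -128, 390625, -8, -128, 127, true>::Impl> {};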
+template +class FixedResultUniformScale { + public: + template + class Impl + : public QuantizationSpecTraitBase< + ConcreteType, FixedResultUniformScale< + BitWidth, ZeroPoint, ScaleMantissa, ScaleExp, + StorageTypeMin, StorageTypeMax, Sign>::Impl> { + public: + QuantizedType GetResultQuantizedType(int index) { + auto op = this->getOperation(); + const auto result_type = + op->getResult(index).getType().template cast(); + if (!result_type.getElementType().template isa()) return {}; + Builder builder(op->getContext()); + const IntegerType storage_type = builder.getIntegerType(BitWidth); + const double scale = static_cast(ScaleMantissa) * + std::pow(10.0, static_cast(ScaleExp)); + return UniformQuantizedType::getChecked( + Sign, storage_type, result_type.getElementType(), scale, ZeroPoint, + StorageTypeMin, StorageTypeMax, builder.getUnknownLoc()); + } + }; +}; + +// This class provides the API for ops that has input as bias. This is used +// as a trait like this: +// +// class Conv2DOp +// : public Op::Impl> +// +// TODO(fengliuai): supports a configurable accumulator bit width. +template +class AccumulatorUniformScale { + public: + template + class Impl + : public QuantizationSpecTraitBase< + ConcreteType, AccumulatorUniformScale::Impl> { + public: + // Whether the index-th operand is a bias. + static bool IsBias(int index) { return index == Bias; } + + // Returns the indexes of all the non-bias operands. + static std::vector GetAllNonBiasOperands() { + return std::vector({Operands...}); + } + }; +}; + +// The trait to specify the operand index of the coefficient for an affine op +// and also the quantization dimension if per-axis quantization is support. +// If the quantization dimension is -1, per-axis quantization isn't supported. +// +// class Conv2DOp +// : public Op::Impl> +// +template +class AffineOpCoefficient { + public: + template + class Impl + : public TraitBase::Impl> { + public: + static int GetCoefficientOperandIndex() { return OperandIndex; } + static int GetQuantizationDim() { return QuantDim; } + }; +}; + +// This class provides the API for ops that can be quantized. +// This is as a trait like this: +// +// class LessOp : public Op { +// +template +class QuantizableResult + : public QuantizationSpecTraitBase {}; + +} // namespace TFL +} // namespace OpTrait +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_TRAITS_H_ diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.cc b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.cc new file mode 100644 index 000000000000..3754ae7fb478 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.cc @@ -0,0 +1,1075 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_traits.h" +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.h" +#include "tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/portable_tensor_utils.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h" +#include "tensorflow/compiler/mlir/tools/optimize/quantization_utils.h" + +namespace mlir { + +// This includes the interface class definition. It couldn't be in a namespace +// because the table gen doesn't emit the namespace when it is used. +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_interface.cc.inc" + +namespace TFL { +namespace { + +constexpr double kSmallestHalfRange = kNearZeroTolerance / 2; +using QType = quant::QuantizedType; + +// Repeats the content of `data` multiple times to resize to `target_size`. +// Note that this only broadcast across one dimension. +template +bool BroadcastVector(int target_size, SmallVectorImpl& data) { + const int size = data.size(); + if (size != target_size) { + if (target_size % size != 0) return true; + data.reserve(target_size); + for (int i = 1; i < target_size / size; ++i) { + data.insert(data.end(), data.begin(), data.begin() + size); + } + } + return false; +} + +// Expands the range to be larger than or equal to 1.0e-6, if it is +// very small (< 1.0e-6). This is to prevent very large quantized value by this +// range. +void ExpandVerySmallRange(const ArrayRef mins, + const ArrayRef maxs, + SmallVectorImpl& effective_mins, + SmallVectorImpl& effective_maxs) { + for (const auto [min, max] : llvm::zip(mins, maxs)) { + // The range is small. Expands the range to stride 0.0 and also at least + // 1.0e-6. 
+ if (max - min > kNearZeroTolerance) { + effective_mins.push_back(min); + effective_maxs.push_back(max); + } else { + effective_mins.push_back(std::min(min, -kSmallestHalfRange)); + effective_maxs.push_back(std::max(max, kSmallestHalfRange)); + } + } +} + +// Sets the min / max, scale and zero_points from the fake quant num_bits +// attribute from QAT. +QuantizedType ResetMinMaxFromNumBits(const QuantizedType type, + const int num_bits, + const bool narrow_range, + const bool is_signed) { + if (num_bits >= 8) { + return type; + } + int64_t qmin = QType::getDefaultMinimumForInteger(is_signed, num_bits); + int64_t qmax = QType::getDefaultMaximumForInteger(is_signed, num_bits); + if (narrow_range) { + qmin += 1; + } + const int64_t storage_type_min = type.getStorageTypeMin(); + const int64_t storage_type_max = type.getStorageTypeMax(); + const double rate = + static_cast(storage_type_max - storage_type_min) / (qmax - qmin); + const auto& recalculate_scale = [&](double scale) -> double { + return scale * rate; + }; + const auto& recalculate_zero_point = [&](int64_t zero_point) -> int64_t { + return qmax - std::round((storage_type_max - zero_point) / rate); + }; + if (auto q_type = dyn_cast(type)) { + const double scale = recalculate_scale(q_type.getScale()); + const double zero_point = recalculate_zero_point(q_type.getZeroPoint()); + return UniformQuantizedType::get(q_type.getFlags(), q_type.getStorageType(), + q_type.getExpressedType(), scale, + zero_point, qmin, qmax); + } else if (auto q_type = dyn_cast(type)) { + const int size = q_type.getScales().size(); + SmallVector scales(size); + SmallVector zero_points(size); + for (int i = 0; i < size; ++i) { + scales[i] = recalculate_scale(q_type.getScales()[i]); + zero_points[i] = recalculate_zero_point(q_type.getZeroPoints()[i]); + } + return quant::UniformQuantizedPerAxisType::get( + q_type.getFlags(), q_type.getStorageType(), q_type.getExpressedType(), + scales, zero_points, q_type.getQuantizedDimension(), qmin, qmax); + } else { + llvm_unreachable("Unsupported QuantizedType in ResetMinMaxFromNumBits"); + } + return type; +} + +// Changes the axis of the input per-channel quantized type to match the +// dimension of the target type. Returns nullptr if it fails. +quant::UniformQuantizedPerAxisType ResetAxisAndBroadcast( + const ArrayRef shape, + const quant::UniformQuantizedPerAxisType qtype, const Type target, + const int quant_dim) { + const auto shaped = dyn_cast(target); + if (!shaped) return {}; + const ArrayRef new_shape = shaped.getShape(); + + SmallVector scales(qtype.getScales().begin(), + qtype.getScales().end()); + SmallVector zero_points(qtype.getZeroPoints().begin(), + qtype.getZeroPoints().end()); + + if (new_shape.size() == shape.size()) { // same rank + // Broadcast the scales and zero points to match the target size, which is + // usually the axis-th dimension of the target type. Currently, it covers + // two cases: + // - for Transpose, the data layout is changed so the `dim[axis]` still + // equals to the `scales_size`. The broadcast skips; + // - for Reshape, the data layout isn't changed but the innermost dimension + // is expand to cover the last two original dimensions. Thus we just need to + // be repeated the `scales` dim[2] times to covers the new dim length. 
+ if (BroadcastVector(shaped.getDimSize(quant_dim), scales) || + BroadcastVector(shaped.getDimSize(quant_dim), zero_points)) { + return {}; + } + } else if ((new_shape.size() == shape.size() + 1) && new_shape.front() == 1) { + // Handle the [A, B, C] -> [1, A, B, C] reshape case. + if (!(std::equal(shape.begin(), shape.end(), new_shape.begin() + 1) && + quant_dim == new_shape.size() - 1)) { + return {}; + } + } else { + return {}; + } + + return quant::UniformQuantizedPerAxisType::get( + qtype.getFlags(), qtype.getStorageType(), qtype.getExpressedType(), + scales, zero_points, quant_dim, qtype.getStorageTypeMin(), + qtype.getStorageTypeMax()); +} + +} // namespace + +bool IsOpQuantizable(Operation* op) { + if (isa(op)) { + // Constant ops do not have QuantizableResult attribute but they can deal + // with quantized tensors. + return true; + } else if (op->hasTrait() || + isa(op)) { + // Terminators, qcast and decast are not quantizable. + return false; + } + + const bool attr_enforced_quantizable = + op->hasAttrOfType(kQuantTraitAttrName) && + op->getAttrOfType(kQuantTraitAttrName).getValue().str() == + QuantTraitValues[QuantizationTrait::FullyQuantizable]; + + const bool attr_output_quantized = QuantizableOpSupportsFloatOutputType(op); + + const bool trait_enforced_quantizable = + op->hasTrait(); + + return attr_enforced_quantizable || trait_enforced_quantizable || + attr_output_quantized; +} + +// Checks if an op has specific attributes that enable quantized inputs with +// float outputs. +bool QuantizableOpSupportsFloatOutputType(Operation* op) { + static constexpr char kOutputTypes[] = "_output_types"; + static constexpr char kSupportOutputTypeFloat[] = + "_support_output_type_float_in_quantized_op"; + + if (!(op->hasAttrOfType(kOutputQuantized) && + op->getAttrOfType(kOutputQuantized).getValue())) { + return false; + } + + if (!(op->hasAttrOfType(kSupportOutputTypeFloat) && + op->getAttrOfType(kSupportOutputTypeFloat) + .getValue())) { + return false; + } + + if (!op->hasAttrOfType(kOutputTypes)) { + return false; + } + + auto output_types_attr = op->getAttrOfType(kOutputTypes); + + if (output_types_attr.size() != op->getResultTypes().size()) { + return false; + } + + for (const auto [attr_element, result_type] : + llvm::zip_equal(output_types_attr, op->getResultTypes())) { + auto type_attr = mlir::dyn_cast_or_null(attr_element); + + if (!type_attr) { + return false; + } + + auto tensor_type = mlir::dyn_cast_or_null(result_type); + + if (!tensor_type) { + return false; + } + + if (type_attr.getValue() != tensor_type.getElementType()) { + return false; + } + } + + return true; +} + +// Returns the quantized type for the +// input_type/min/max/storag_type_width/narrow_range. +// This is entry point to the Quant dialect and used for both quantizing +// activations and weights. +Type GetQuantizedType(Builder builder, const Type input_type, + const ArrayRef min, const ArrayRef max, + const int quant_dim, const int storage_type_width, + const bool narrow_range, const bool is_signed, + const bool legacy_float_scale, + const bool use_fake_quant_num_bits) { + auto converter = + mlir::quant::ir::ExpressedToQuantizedConverter::forInputType(input_type); + + // Expand the range to prevent extremely small scales and large quantized + // integers which can cause overflow. This leads to scale + // 7.843137254901961e-9 with 8 bits. 
+ SmallVector effective_mins, effective_maxs; + ExpandVerySmallRange(min, max, effective_mins, effective_maxs); + + quant::QuantizedType quantized_element_type; + if (min.size() == 1 && max.size() == 1 && quant_dim == -1) { + quantized_element_type = quantfork::fakeQuantAttrsToType( + builder.getUnknownLoc(), storage_type_width, effective_mins[0], + effective_maxs[0], narrow_range, converter.expressed_type, is_signed); + if (legacy_float_scale) { + quantized_element_type = + DownCastScale(quantized_element_type, effective_mins[0], + effective_maxs[0], builder.getUnknownLoc()); + } + } else if (min.size() == max.size()) { + auto shape = dyn_cast(input_type); + if (!shape || shape.getRank() <= quant_dim || + static_cast(min.size()) != shape.getDimSize(quant_dim)) { + return {}; + } + // The quantization dim is set to the last dimension. + quantized_element_type = quantfork::fakeQuantAttrsToType( + builder.getUnknownLoc(), storage_type_width, quant_dim, effective_mins, + effective_maxs, narrow_range, converter.expressed_type, is_signed); + if (legacy_float_scale) { + quantized_element_type = + DownCastScale(quantized_element_type, effective_mins, effective_maxs, + builder.getUnknownLoc()); + } + } + if (!quantized_element_type) return {}; + // Use fake quant configured bit-widths (only supported for + // 1 < num_bits < 8 bits) instead of using 8-bit defaults. + if (use_fake_quant_num_bits && storage_type_width > 1 && + storage_type_width < 8 && + quantized_element_type.getStorageTypeMax() > + QType::getDefaultMinimumForInteger(is_signed, storage_type_width)) { + const auto resetEleType = ResetMinMaxFromNumBits( + quantized_element_type, storage_type_width, narrow_range, is_signed); + return converter.convert(resetEleType); + } + return converter.convert(quantized_element_type); +} + +// TODO(fengliuai): promote this utility method to mlir QuantOps. +TypeAttr RescaleQuantizedType(const Type input, const Attribute factor) { + const auto factor_values = dyn_cast_or_null(factor); + if (!factor_values) return {}; + const auto element_type = + quant::QuantizedType::getQuantizedElementType(input); + if (!element_type) return {}; + if (auto qtype = dyn_cast(element_type)) { + const ArrayRef scales = qtype.getScales(); + // Broadcasting hasn't been implemented yet. + if (static_cast(scales.size()) != factor_values.getNumElements()) + return {}; + SmallVector new_scales; + new_scales.reserve(scales.size()); + auto scales_iter = scales.begin(); + for (const auto& f : factor_values) { + new_scales.push_back(*scales_iter * + std::fabs(FloatAttr::getValueAsDouble(f))); + ++scales_iter; + } + // We are assuming symmetric quantization. + auto new_ele_type = quant::UniformQuantizedPerAxisType::get( + qtype.getFlags(), qtype.getStorageType(), qtype.getExpressedType(), + new_scales, qtype.getZeroPoints(), qtype.getQuantizedDimension(), + qtype.getStorageTypeMin(), qtype.getStorageTypeMax()); + if (const auto new_type = new_ele_type.castFromExpressedType( + quant::QuantizedType::castToExpressedType(input))) { + return TypeAttr::get(new_type); + } + } + // Currently, we only support per-axis quantized type. 
+ return {}; +} + +TypeAttr GetQuantizedTypeAttr(const Builder builder, const Type input_type, + const Attribute min, const Attribute max, + const int quant_dim, const IntegerAttr num_bits, + const BoolAttr narrow_range, const bool is_signed, + const bool legacy_float_scale, + const bool use_fake_quant_num_bits) { + SmallVector min_value, max_value; + const auto mins = dyn_cast(min); + const auto maxs = dyn_cast(max); + if (mins && maxs) { + min_value.reserve(mins.getNumElements()); + max_value.reserve(maxs.getNumElements()); + for (auto it = mins.begin(); it != mins.end(); ++it) { + min_value.push_back(FloatAttr::getValueAsDouble(*it)); + } + for (auto it = maxs.begin(); it != maxs.end(); ++it) { + max_value.push_back(FloatAttr::getValueAsDouble(*it)); + } + } else { + const auto fmin = dyn_cast(min); + const auto fmax = dyn_cast(max); + if (fmin && fmax) { + min_value.push_back(fmin.getValueAsDouble()); + max_value.push_back(fmax.getValueAsDouble()); + } else { + return {}; + } + } + const Type final_type = + GetQuantizedType(builder, input_type, min_value, max_value, quant_dim, + num_bits.getInt(), narrow_range.getValue(), is_signed, + legacy_float_scale, use_fake_quant_num_bits); + if (!final_type) return {}; + return TypeAttr::get(final_type); +} + +TypeAttr CastQuantizedTypeAttrFromExpressedType(const Builder builder, + const TypeAttr source, + const Type target, + const int axis) { + const auto source_type = dyn_cast_or_null(source.getValue()); + if (!source_type) return {}; + const auto src_ele_type = source_type.getElementType(); + auto qtype = dyn_cast(src_ele_type); + + // Reset the quantization dimensions if it is per-axis. + if (const auto per_axis = + dyn_cast_or_null(qtype)) { + // For the pass-through ops, we don't know which the dimension will be the + // new quantization dimension. Only if the new quantization dimension can + // be inferred, it is safe to reset the per-axis quantized type. + if (axis == -1) return {}; + qtype = + ResetAxisAndBroadcast(source_type.getShape(), per_axis, target, axis); + } + if (!qtype) return {}; + const Type final_type = qtype.castFromExpressedType(target); + if (!final_type) return {}; + return TypeAttr::get(final_type); +} + +void ExtractMinMaxFromAttr(const DenseFPElementsAttr values, const int dim_size, + const int slice_size, bool symmetric, + SmallVectorImpl& mins, + SmallVectorImpl& maxs) { + // If all the element values are same we don't need to scan the content. + if (values.isSplat()) { + const double single_value = + FloatAttr::getValueAsDouble(values.getSplatValue()); + + // When the single value isn't 0.0, we expand it to a range to include + // this single value and 0.0. This will give us a scale and zero point + // works for both this value and 0.0. + if (single_value < 0.0) { + mins[0] = single_value; + maxs[0] = symmetric ? -single_value : 0.0; + } else if (single_value > 0.0) { + mins[0] = symmetric ? 
-single_value : 0.0; + maxs[0] = single_value; + } else { + mins[0] = maxs[0] = single_value; + } + for (int i = 1; i < dim_size; ++i) { + mins[i] = mins[0]; + maxs[i] = maxs[0]; + } + } else { + int64_t flatten_index = 0; + auto begin = values.begin(); + auto end = values.end(); + for (auto it = begin; it != end; ++it, ++flatten_index) { + const double ele_value = FloatAttr::getValueAsDouble(*it); + const int slice_index = flatten_index / slice_size; + const int channel_index = slice_index % dim_size; + mins[channel_index] = std::min(mins[channel_index], ele_value); + maxs[channel_index] = std::max(maxs[channel_index], ele_value); + } + // Expand range to include 0. + for (int i = 0; i < dim_size; ++i) { + maxs[i] = std::max(maxs[i], 0.0); + mins[i] = std::min(mins[i], 0.0); + } + if (symmetric) { + for (int i = 0; i < dim_size; ++i) { + maxs[i] = std::max(std::abs(mins[i]), std::abs(maxs[i])); + mins[i] = -maxs[i]; + } + } + } +} + +Type GetUniformQuantizedTypeForWeight( + const ElementsAttr attr, const bool symmetric, const unsigned num_bits, + const bool is_signed, const bool narrow_range, + const bool legacy_float_scale, const bool use_fake_quant_num_bits) { + const Builder builder(attr.getContext()); + // `symmetric` can only be used when it is `signed` and `narrow_range`. + if (symmetric && (!is_signed || !narrow_range)) return {}; + + SmallVector mins(1, std::numeric_limits::max()); + SmallVector maxs(1, std::numeric_limits::min()); + const auto fp = dyn_cast(attr); + if (!fp) return {}; + + // Computes the effective min/max values of the attribute values. + ExtractMinMaxFromAttr(fp, /*dim_size=*/1, /*slice_size=*/1, symmetric, mins, + maxs); + + const auto type = + GetQuantizedType(builder, attr.getType(), mins[0], maxs[0], + /*quant_dim=*/-1, num_bits, narrow_range, is_signed, + legacy_float_scale, use_fake_quant_num_bits); + if (const auto ele_type = dyn_cast_or_null(type)) + return ele_type.getElementType(); + + return {}; +} + +Type GetUniformQuantizedPerAxisTypeForWeight( + const ElementsAttr attr, const int quant_dim, const bool symmetric, + const unsigned num_bits, const bool is_signed, const bool narrow_range, + const bool legacy_float_scale, const bool use_fake_quant_num_bits) { + const Builder builder(attr.getContext()); + const auto shape = cast(attr.getType()).getShape(); + if (static_cast(shape.size()) <= quant_dim) return {}; + // `symmetric` can only be used when it is `signed` and `narrow_range`. + if (symmetric && (!is_signed || !narrow_range)) return {}; + + const int dim_size = shape[quant_dim]; + const int slice_size = + std::accumulate(std::next(shape.begin(), quant_dim + 1), shape.end(), 1, + std::multiplies()); + SmallVector mins(dim_size, std::numeric_limits::max()); + SmallVector maxs(dim_size, std::numeric_limits::min()); + const auto fp = dyn_cast(attr); + if (!fp) return {}; + + // Computes the effective min/max values of the attribute values. 
+ ExtractMinMaxFromAttr(fp, dim_size, slice_size, symmetric, mins, maxs); + + const auto type = GetQuantizedType( + builder, attr.getType(), mins, maxs, quant_dim, num_bits, narrow_range, + is_signed, legacy_float_scale, use_fake_quant_num_bits); + if (auto ele_type = dyn_cast_or_null(type)) + return ele_type.getElementType(); + + return {}; +} + +quant::QuantizedType GetUniformQuantizedTypeForBias( + const std::vector& op_types, + const int adjusted_quant_dim, const bool legacy_float_scale) { + if (op_types.empty()) return {}; + + size_t axis_size = 1; + int32_t quant_dim = -1; + Type expressed_type; + // Requires all the op types are valid UniformQuantizedTypes or + // UniformQuantizedPerAxisTypes and also have same expressed type. For all + // the UniformQuantizedPerAxisTypes, the quantization dimension index and + // dimension sizes are same. + for (const auto op_type : op_types) { + if (!op_type) return {}; + if (expressed_type && expressed_type != op_type.getExpressedType()) { + return {}; + } + expressed_type = op_type.getExpressedType(); + + if (const auto type = + dyn_cast(op_type)) { + if (axis_size != 1 && axis_size != type.getScales().size()) return {}; + if (quant_dim != -1 && quant_dim != type.getQuantizedDimension()) + return {}; + axis_size = type.getScales().size(); + quant_dim = type.getQuantizedDimension(); + } else if (!isa(op_type)) { + return {}; + } + } + + // The scale from the UniformQuantizedTypes is broadcasted if there are + // UniformQuantizedPerAxisTypes. + SmallVector scales(axis_size, 1.0); + for (const auto op_type : op_types) { + if (const auto type = + dyn_cast(op_type)) { + for (const auto& index_scale : llvm::enumerate(type.getScales())) { + scales[index_scale.index()] *= index_scale.value(); + } + } else if (const auto type = + dyn_cast(op_type)) { + for (int index = 0; index < axis_size; ++index) { + scales[index] *= type.getScale(); + } + } + } + if (legacy_float_scale) { + for (int i = 0; i < scales.size(); ++i) { + scales[i] = static_cast(scales[i]); + } + } + + // Builds the result quantized type, which has signed 32 bits storage type. + Builder builder(expressed_type.getContext()); + const IntegerType storage_type = builder.getIntegerType(32); + const int64_t storage_type_min = + quant::QuantizedType::getDefaultMinimumForInteger(/*isSigned=*/true, 32); + const int64_t storage_type_max = + quant::QuantizedType::getDefaultMaximumForInteger(/*isSigned=*/true, 32); + if (axis_size == 1) { + return quant::UniformQuantizedType::getChecked( + builder.getUnknownLoc(), + /*flags=*/true, storage_type, expressed_type, scales[0], + /*zeroPoint=*/0, storage_type_min, storage_type_max); + } else { + SmallVector zero_points(axis_size, 0); + // If the bias is a 1-D tensor, set the `quantizedDimension` to 0. + // If the bias rank is larger than 1 because it was already broadcasted + // to match the output shape, use the last index. 
+ return quant::UniformQuantizedPerAxisType::getChecked( + builder.getUnknownLoc(), + /*flags=*/true, storage_type, expressed_type, scales, zero_points, + /*quantizedDimension=*/std::max(adjusted_quant_dim, 0), + storage_type_min, storage_type_max); + } +} + +ElementsAttr QuantizeLegacy(const Attribute real_value, + const Type tensor_type) { + if (!isa(real_value) || + !quant::QuantizedType::getQuantizedElementType(tensor_type)) { + return {}; + } + const auto real_values_attr = cast(real_value); + auto q_type = quant::QuantizedType::getQuantizedElementType(tensor_type); + std::vector real_values; + SmallVector quantized_attr; + real_values.reserve(real_values_attr.getNumElements()); + quantized_attr.reserve(real_values_attr.getNumElements()); + std::transform(real_values_attr.begin(), real_values_attr.end(), + std::back_inserter(real_values), [&](APFloat value) -> float { + return value.convertToFloat(); + }); + const ShapedType new_dense_type = dyn_cast_or_null( + q_type.castExpressedToStorageType(real_values_attr.getType())); + const int width = dyn_cast(q_type.getStorageType()).getWidth(); + + if (width == 8 && q_type.getStorageTypeMax() == 127 && + q_type.getStorageTypeMin() == -127) { + std::vector quantized_values(real_values_attr.getNumElements()); + if (auto uniform_type = dyn_cast(q_type)) { + float min, max, scale; + mlir::lite::toco_legacy::PortableSymmetricQuantizeFloats( + real_values.data(), real_values.size(), quantized_values.data(), &min, + &max, &scale); + // The scale has been adjusted, so the adjusted scale should be respected. + if (std::abs(scale - uniform_type.getScale()) > 1e-3) { + return Quantize(real_value, tensor_type); + } + } else if (auto uniform_type = + dyn_cast(q_type)) { + std::vector scales_inv; + std::vector dimension; + dimension.insert(dimension.end(), new_dense_type.getShape().begin(), + new_dense_type.getShape().end()); + std::transform(uniform_type.getScales().begin(), + uniform_type.getScales().end(), + std::back_inserter(scales_inv), + [](float scale) { return 1.0 / scale; }); + + tflite_migration::optimize::utils::SymmetricPerChannelQuantizeValues( + real_values.data(), scales_inv, dimension, + uniform_type.getQuantizedDimension(), &quantized_values); + } else { + return {}; + } + std::transform(quantized_values.begin(), quantized_values.end(), + std::back_inserter(quantized_attr), + [&](int8_t value) -> APInt { + return APInt(8, value, /*isSigned=*/true); + }); + return DenseElementsAttr::get(new_dense_type, quantized_attr); + } else if (width == 8) { + // This can be a state tensor, or an actual constant tensor with + // asymmetric range. For a state tensor, assigning correct quantization + // parameters is sufficient, and for constants with asymmetric range it's + // not correctly quantized by legacy quantizer so call the new Quantize. 
+ return Quantize(real_value, tensor_type); + } else if (width == 16) { + if (const auto uniform_type = dyn_cast(q_type)) { + const auto quantized_values = + tflite_migration::optimize::utils::SymmetricQuantizeFloatsToInt16( + real_values.data(), real_values.size(), uniform_type.getScale()); + std::transform(quantized_values.begin(), quantized_values.end(), + std::back_inserter(quantized_attr), + [&](int16_t value) -> APInt { + return APInt(16, value, /*isSigned=*/true); + }); + return DenseElementsAttr::get(new_dense_type, quantized_attr); + } + } else if (width == 32) { + std::vector scales; + if (const auto uniform_type = dyn_cast(q_type)) { + scales.push_back(uniform_type.getScale()); + } else if (const auto uniform_type = + dyn_cast(q_type)) { + scales.insert(scales.end(), uniform_type.getScales().begin(), + uniform_type.getScales().end()); + } else { + return {}; + } + const auto quantized_bias = + tflite_migration::optimize::utils::SymmetricBiasQuantize( + real_values.data(), real_values.size(), scales); + std::transform(quantized_bias.begin(), quantized_bias.end(), + std::back_inserter(quantized_attr), + [&](int32_t value) -> APInt { + return APInt(32, value, /*isSigned=*/true); + }); + return DenseElementsAttr::get(new_dense_type, quantized_attr); + } + return {}; +} + +ElementsAttr Quantize(const Attribute real_value, const Type tensor_type) { + if (const auto q_type = + quant::QuantizedType::getQuantizedElementType(tensor_type)) { + Type converted_type; + return dyn_cast_or_null( + quantfork::quantizeAttr(real_value, q_type, converted_type)); + } + return {}; +} + +quant::QuantizedType DownCastScale(QuantizedType type, double min, double max, + Location loc) { + const SmallVector mins = {min}; + const SmallVector maxs = {max}; + return DownCastScale(type, mins, maxs, loc); +} + +quant::QuantizedType DownCastScale(QuantizedType type, + const SmallVectorImpl& mins, + const SmallVectorImpl& maxs, + Location loc) { + // The given type can be null. For example, there can be an invalid scale and + // so on. + if (!type) return type; + SmallVector scales(mins.size()); + SmallVector zero_points(mins.size()); + if (auto q_type = dyn_cast(type)) { + zero_points.push_back(q_type.getZeroPoint()); + } else if (auto q_type = dyn_cast(type)) { + zero_points = {q_type.getZeroPoints().begin(), + q_type.getZeroPoints().end()}; + } + for (int i = 0; i < mins.size(); ++i) { + scales[i] = (static_cast(maxs[i]) - static_cast(mins[i])) / + (type.getStorageTypeMax() - type.getStorageTypeMin()); + if (type.getStorageTypeMax() != -type.getStorageTypeMin()) { + // Only applies for asymmetric quantized range with original scale. 
+ const float zero_point_from_min = + type.getStorageTypeMin() - mins[i] / scales[i]; + if (zero_point_from_min < type.getStorageTypeMin()) { + zero_points[i] = static_cast(type.getStorageTypeMin()); + } else if (zero_point_from_min > type.getStorageTypeMax()) { + zero_points[i] = static_cast(type.getStorageTypeMax()); + } else { + zero_points[i] = static_cast(std::round(zero_point_from_min)); + } + } + } + if (auto q_type = dyn_cast(type)) { + return UniformQuantizedType::get(q_type.getFlags(), q_type.getStorageType(), + q_type.getExpressedType(), scales[0], + zero_points[0], q_type.getStorageTypeMin(), + q_type.getStorageTypeMax()); + } else if (auto q_type = dyn_cast(type)) { + return quant::UniformQuantizedPerAxisType::get( + q_type.getFlags(), q_type.getStorageType(), q_type.getExpressedType(), + scales, zero_points, q_type.getQuantizedDimension(), + q_type.getStorageTypeMin(), q_type.getStorageTypeMax()); + } + return type; +} + +// A heuristic to determine whether the scales needs to be from operands or +// from results for the ops with the `SameOperandsAndResultsScale` property. +// The current implementation is based on the number of operands. +static bool PreferResultScale(Operation* op) { + int float_operands = 0; + for (auto operand : op->getOperands()) { + if (auto operand_type = dyn_cast(operand.getType())) { + if (isa(operand_type.getElementType())) { + if (++float_operands > 1) return true; + } + } + } + return false; +} + +std::unique_ptr GetDefaultQuantScaleSpec(Operation* op) { + auto spec = std::make_unique(); + if (isa(op)) { + spec->has_same_scale_requirement = true; + spec->required_same_scale_func = [op](const bool sign, + const int bit_width) { + return cast(op) + .RequiredSameOperandsAndResultsScale(sign, bit_width); + }; + spec->required_same_quantized_axes_func = [op]() { + return cast(op).RequiredSameQuantizedAxes(); + }; + } + if (isa(op)) { + spec->has_fixed_output_range = true; + spec->fixed_output_range_func = [op](bool sign, int bit_width) { + return cast(op).GetFixedOutputRange(sign, + bit_width); + }; + } + return spec; +} + +// The stats op of some of the ops can be redundant. The current implementation +// only considers the ops with restricted output params. +static bool IsStatsRedundant( + Operation* op, const OpQuantSpecGetter op_quant_spec_getter, + const OpQuantScaleSpecGetter op_quant_scale_spec_getter) { + // If it has FixedOutputRangeInterface, no need to manually create spec. + return isa(op) || + op_quant_scale_spec_getter(op)->has_fixed_output_range; +} + +static bool IsSameScaleOp( + Operation* op, const OpQuantScaleSpecGetter op_quant_scale_spec_getter) { + // If it has SameScalesOpInterface, no need to manually create spec. + return dyn_cast(op) || + op_quant_scale_spec_getter(op)->has_same_scale_requirement; +} + +bool RemoveRedundantStatsOps( + func::FuncOp func, const OpQuantSpecGetter op_quant_spec_getter, + const OpQuantScaleSpecGetter op_quant_scale_spec_getter) { + SmallVector all_stats_ops; + llvm::DenseSet redundant_stats_ops; + + // Step 0: remove the quantfork::StatisticsOp which are used by the + // quant.qcast op in case it overrides the information from training FakeQuant + // ops. + func.walk([&](quantfork::QuantizeCastOp q) { + auto input_op = q.getArg().getDefiningOp(); + if (auto stats = dyn_cast_or_null(input_op)) { + q.setOperand(stats.getArg()); + if (stats.use_empty()) stats.erase(); + } + }); + + // Step 1: forward pass: propagate any value scales which are not produces + // by `SameOperandsAndResultsScale`. 
Additionally, remove the value scales + // which are produced by the ops with the `FixedOutputRangeInterface`. + // Note that we don't propagate across the multiple-operands + // `SameOperandsAndResultsScale` ops like `concatenation`. + func.walk([&](quantfork::StatisticsOp stats_op) { + all_stats_ops.push_back(stats_op); + }); + + while (!all_stats_ops.empty()) { + quantfork::StatisticsOp stats_op = all_stats_ops.back(); + all_stats_ops.pop_back(); + + if (auto def = stats_op.getArg().getDefiningOp()) { + if (IsStatsRedundant(def, op_quant_spec_getter, + op_quant_scale_spec_getter)) { + redundant_stats_ops.insert(stats_op); + } + } + + for (Operation* user : stats_op.getResult().getUsers()) { + // We don't propagate this parameter down if it has multiple operands. + // We want to use the result parameter scales instead. + if (!IsSameScaleOp(user, op_quant_scale_spec_getter) || + PreferResultScale(user)) { + continue; + } + for (Value res : user->getResults()) { + if (!res.hasOneUse()) { + continue; + } + if (auto next_stats = + dyn_cast(*res.getUsers().begin())) { + // quantization parameters can be propagated to next_stats + redundant_stats_ops.insert(next_stats); + // add next_stats to the work list so propagation can continue. + all_stats_ops.push_back(next_stats); + } + } + } + } + + // Step 2: backward pass: For the ops skipped in the forward pass, propagate + // its results scale backwards as far as possible. + func.walk([&](quantfork::StatisticsOp stats_op) { + if (redundant_stats_ops.find(stats_op) == redundant_stats_ops.end()) { + all_stats_ops.push_back(stats_op); + } + }); + + while (!all_stats_ops.empty()) { + quantfork::StatisticsOp stats_op = all_stats_ops.back(); + all_stats_ops.pop_back(); + + if (Operation* def = stats_op.getArg().getDefiningOp()) { + if (!IsSameScaleOp(def, op_quant_scale_spec_getter)) { + continue; + } + for (Value input : def->getOperands()) { + if (auto next_stats = dyn_cast_or_null( + input.getDefiningOp())) { + redundant_stats_ops.insert(next_stats); + all_stats_ops.push_back(next_stats); + } + } + } + } + + // Step3: Remove all the redundant stats ops + for (Operation* it : redundant_stats_ops) { + if (!isa(it)) return true; + auto stats_op = cast(it); + stats_op.getResult().replaceAllUsesWith(stats_op.getArg()); + stats_op.erase(); + } + + // Returns false if the steps finish without errors. + return false; +} + +LogicalResult VerifySameScales(Operation* op) { + auto same_scale_op = cast(op); + + SmallVector collected_quant_params; + for (Value input : op->getOperands()) { + QuantizedType quant_params = + QuantizedType::getQuantizedElementType(input.getType()); + // Skip non-quantizable operands. + if (quant_params) { + collected_quant_params.push_back(quant_params); + } + } + + for (Value output : op->getResults()) { + const QuantizedType quant_params = + QuantizedType::getQuantizedElementType(output.getType()); + // Skip non-quantizable results. + if (quant_params) { + collected_quant_params.push_back(quant_params); + } + } + + if (collected_quant_params.size() <= 1) return success(); + const auto& expected_params = collected_quant_params[0]; + for (int i = 1; i < collected_quant_params.size(); ++i) { + const auto& compared_params = collected_quant_params[i]; + // For some ops (such as Transpose or Squeeze), the quantized axis might not + // be the same, this function only verifies the scale and zero point in + // that case. The quantized axis should be verified in their own verifier + // method. 
+ if (!same_scale_op.RequiredSameQuantizedAxes()) { + const auto expected_per_axis_qtype = + dyn_cast(expected_params); + const auto compared_per_axis_qtype = + dyn_cast(compared_params); + if (expected_per_axis_qtype && compared_per_axis_qtype && + llvm::equal(expected_per_axis_qtype.getScales(), + compared_per_axis_qtype.getScales()) && + llvm::equal(expected_per_axis_qtype.getZeroPoints(), + compared_per_axis_qtype.getZeroPoints()) && + expected_params.getStorageType() == + compared_params.getStorageType() && + expected_params.getExpressedType() == + compared_params.getExpressedType()) { + continue; + } + } + // Same quantization parameters are always ok. + if (expected_params == compared_params) continue; + // If the quantization parameters are not the same, as long as it has the + // same storage type and the op interface doesn't require same scale + // constraint for this storage type, it is still ok. + if (expected_params.isSigned() == compared_params.isSigned() && + expected_params.getStorageTypeIntegralWidth() == + compared_params.getStorageTypeIntegralWidth() && + !same_scale_op.RequiredSameOperandsAndResultsScale( + expected_params.isSigned(), + expected_params.getStorageTypeIntegralWidth())) + continue; + + std::string err_msg = + "quantization parameters violate the same scale constraint: "; + llvm::raw_string_ostream os(err_msg); + expected_params.print(os); + os << " vs. "; + compared_params.print(os); + os.flush(); + return op->emitOpError(err_msg); + } + return success(); +} + +quant::UniformQuantizedType GetFixedOutputRange( + const bool is_signed, const int bit_width, const Type tensor_type, + const double scale, int64_t zero_point, int64_t storage_min, + int64_t storage_max) { + const auto result_type = cast(tensor_type); + if (!isa(result_type.getElementType())) return {}; + Builder builder(result_type.getContext()); + + // Only support 8-bits and 16-bits + if (bit_width != 8 && bit_width != 16) return {}; + const IntegerType storage_type = builder.getIntegerType(bit_width); + if (!is_signed && bit_width == 8) { + zero_point += 128; + storage_min += 128; + storage_max += 128; + } + return quant::UniformQuantizedType::getChecked( + builder.getUnknownLoc(), is_signed, storage_type, + result_type.getElementType(), scale, zero_point, storage_min, + storage_max); +} + +quant::UniformQuantizedType GetFixedOutputRange(const bool is_signed, + const int bit_width, + const Type tensor_type, + const double scale, + const int64_t zero_point) { + return GetFixedOutputRange(is_signed, bit_width, tensor_type, scale, + zero_point, + /*storage_min=*/-(1 << (bit_width - 1)), + /*storage_max=*/(1 << (bit_width - 1)) - 1); +} + +Type ConvertSignedQuantizedToUnsigned(const Type signed_tensor_type, + const Location loc) { + const auto qtype = QType::getQuantizedElementType(signed_tensor_type); + if (!qtype || !qtype.isSigned()) return {}; + + const int num_bits = qtype.getStorageTypeIntegralWidth(); + // This is a negative value, and will be applied on zero points and fixed + // point ranges. 
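+  // Illustrative arithmetic for the 8-bit case handled below: the default signed
+  // minimum is -128 and the default unsigned minimum is 0, so the offset is -128.
+  // Subtracting it shifts zero points and storage bounds up by 128, e.g. a signed
+  // type with zero_point = -5 and range [-128, 127] maps to an unsigned type with
+  // zero_point = 123 and range [0, 255]. Minimal sketch (hypothetical helper):
+  //
+  //   #include <cstdint>
+  //
+  //   inline int64_t SignedToUnsignedZeroPoint(int64_t signed_zero_point,
+  //                                            int num_bits) {
+  //     const int64_t signed_min = -(int64_t{1} << (num_bits - 1));
+  //     const int64_t unsigned_min = 0;
+  //     const int64_t offset = signed_min - unsigned_min;  // negative, -128 for 8 bits
+  //     return signed_zero_point - offset;                 // -5 -> 123 for 8 bits
+  //   }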
+ const int64_t offset = + QType::getDefaultMinimumForInteger(/*isSigned=*/true, num_bits) - + QType::getDefaultMinimumForInteger(/*isSigned=*/false, num_bits); + + const auto flags = !quant::QuantizationFlags::Signed; + QType new_qtype; + if (auto uqtype = dyn_cast(qtype)) { + new_qtype = quant::UniformQuantizedType::getChecked( + loc, flags, qtype.getStorageType(), qtype.getExpressedType(), + uqtype.getScale(), uqtype.getZeroPoint() - offset, + uqtype.getStorageTypeMin() - offset, + uqtype.getStorageTypeMax() - offset); + } else if (auto aqtype = + dyn_cast(qtype)) { + const auto zero_points = aqtype.getZeroPoints(); + SmallVector new_zero_points(zero_points.begin(), + zero_points.end()); + for (int i = 0; i < new_zero_points.size(); ++i) { + new_zero_points[i] -= offset; + } + new_qtype = quant::UniformQuantizedPerAxisType::getChecked( + loc, flags, qtype.getStorageType(), qtype.getExpressedType(), + aqtype.getScales(), new_zero_points, aqtype.getQuantizedDimension(), + aqtype.getStorageTypeMin() - offset, + aqtype.getStorageTypeMax() - offset); + } + return new_qtype.castFromExpressedType( + QType::castToExpressedType(signed_tensor_type)); +} + +LogicalResult RemoveDebugAttrPattern::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + // removeAttr will return nullptr if the attribute did not exist. Thus we can + // return success(result) to indicate if this op has changed. + return success(/*isSuccess=*/ + op->removeAttr(kDebugModeOpQuantAttrName) || + op->removeAttr(kDebugModeOpFloatAttrName)); +} + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h new file mode 100644 index 000000000000..66d307dd2fbd --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h @@ -0,0 +1,973 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TFLite transformation +// passes to work with op attributes. 
+ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_traits.h" +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace mlir { +namespace TFL { + +// A unit attribute can be attached to the quantize/dequantize ops which are +// added by the quantization passes. These ops can be removed erased without +// losing accuracy. +inline constexpr char kVolatileOpAttrName[] = "volatile"; + +// Following attributes are used to mark ops that are not quantizable during +// debug model generation process for whole-model verify mode. If these +// attributes are attached, the upstream float/quantized ops know which ops to +// connect to, and it also prevents these ops from being copied again. +inline constexpr char kDebugModeOpFloatAttrName[] = "debug_float"; +inline constexpr char kDebugModeOpQuantAttrName[] = "debug_quant"; + +// Used to annotate custom ops if they are quantizable. 
+inline constexpr char kQuantTraitAttrName[] = "_tfl_quant_trait"; +enum QuantizationTrait { FullyQuantizable = 0, NotQuantizable = 1 }; +inline constexpr absl::string_view QuantTraitValues[] = {"fully_quantizable", + "not_quantizable"}; +inline constexpr char kOutputQuantized[] = "_output_quantized"; + +inline constexpr double kNearZeroTolerance = 1.0e-6; + +using QuantParams = QuantizedType; +using QuantSpec = QuantizationSpecs; +using SignedInteger = std::pair; // bitwidth and sign +using QuantParamsForResults = llvm::SmallVector; +using AccumulatorScaleFunc = + std::function&, int, bool)>; +using BiasParamsMap = + absl::flat_hash_map, AccumulatorScaleFunc>>; +// UniformQuantizedType GetFixedOutputRange(bool sign, int bit_width) +using GetFixedOutputRangeFunc = std::function; +// bool RequiredSameOperandsAndResultsScale(bool sign, int $bit_width) +using RequiredSameOperandsAndResultsScaleFunc = std::function; +// bool RequiredSameQuantizedAxes() +using RequiredSameQuantizedAxesFunc = std::function; + +using CustomMap = CustomOpMap; +using Operation = ::mlir::Operation; + +// Quantization spec of an op, driving the quantization algorithm. +struct OpQuantSpec { + // Maps the operand index of a bias input to its quantization specifications, + // including the non-bias operand indexes and the method retrieving + // quantization parameters from list of parameters of the non-bias operands. + // This map is empty if the op doesn't have a bias operand. + BiasParamsMap biases_params; + + // Quantization parameters for value restricted outputs. This is the + // "hard-coded" parameters and should be used unconditionally for the + // quantized op. This vector is empty if the op doesn't have value restricted + // outputs. + llvm::DenseMap restricted_output_params; + + // Coefficient operand index and whether supporting per-channel quantization. + // For QAT, this information is carried by the FakeQuant*/Quantize/Dequantize + // ops, but post-training quantization, the quantization parameters need to be + // inferred from the tensor content and op property. A "-1" value indicates + // the operand doesn't support per-channel quantization. + llvm::DenseMap coeff_op_quant_dim; + + // Indices of quantizable operands. Biases are not included in this field, + // the indices of biases can be found in the `biases_params`. + absl::flat_hash_set quantizable_operands; +}; + +// A function signature for getting the particular OpQuantSpec for the provided +// op. +using OpQuantSpecGetter = + std::function(mlir::Operation*)>; + +// Quantization scale spec of an op. The information defined in the MLIR +// interfaces FixedOutputRangeInterface and SameOperandsAndResultsScale should +// be checked first if present. +// TODO: b/323478683: Consider deprecating this. +struct OpQuantScaleSpec { + // Whether this op has a fixed range requirement (e.g. sigmoid) + bool has_fixed_output_range = false; + // Whether this op should have same operand and result scales (e.g. concat) + bool has_same_scale_requirement = false; + // Whether this op should have same operand and result type (e.g. gather) + bool has_same_operand_and_result_type_requirement = false; + // Returns the fixed output range, when has_fixed_output_range is set. + GetFixedOutputRangeFunc fixed_output_range_func; + // Returns whether same operands and results scales are required. 
+ RequiredSameOperandsAndResultsScaleFunc required_same_scale_func = + [](bool sign, int bit_width) { return true; }; + // Returns whether operands and results must have the same quantized axis. + RequiredSameQuantizedAxesFunc required_same_quantized_axes_func = []() { + return true; + }; +}; + +// A function signature for getting the particular OpQuantScaleSpec for the +// provided op. +using OpQuantScaleSpecGetter = + std::function(mlir::Operation*)>; + +// Used in TFL Numeric Verify +struct NumericVerifySpec { + // Whether to enable numeric verification + bool verify_numeric = false; + + // Tolerance level from the quantized value for verification. If the tolerance + // is very small(<0.1), only the stats of the diff is displayed. + float error_tolerance = 5.0f; + + // Whether to verify numerical correctness layer by layer or by whole model + bool whole_model_verify = false; + + // Whether to enable log for failures + bool log_if_failed_flag = false; +}; + +// Used in TFL Quantize Pass +struct QuantPassSpec { + // Variables to control TFL Numeric Verify + NumericVerifySpec numeric_verify_spec; + + // Variables related to quantization + QuantSpec quant_spec; +}; + +// Re-calculates scales again in float instead of simply downcasting existing +// scales. +quant::QuantizedType DownCastScale(quant::QuantizedType type, + const SmallVectorImpl& mins, + const SmallVectorImpl& maxs, + Location loc); + +quant::QuantizedType DownCastScale(quant::QuantizedType type, double min, + double max, Location loc); + +bool IsOpQuantizable(mlir::Operation* op); +bool QuantizableOpSupportsFloatOutputType(mlir::Operation* op); + +// Specialized version of location to string for flatbuffer exported locations. +inline std::string GetTensorNameFromLoc(Location loc) { + if (auto name_loc = llvm::dyn_cast(loc)) { + return name_loc.getName().str(); + } + return ""; +} + +template +struct ConvertStatsToQDQs : public OpRewritePattern { + ConvertStatsToQDQs(int num_bits, bool narrow_range, bool is_signed, + bool legacy_float_scale, MLIRContext* context) + : OpRewritePattern(context), + num_bits(num_bits), + narrow_range(narrow_range), + is_signed(is_signed), + legacy_float_scale(legacy_float_scale) {} + + LogicalResult matchAndRewrite(quantfork::StatisticsOp op, + PatternRewriter& rewriter) const override { + Type expressed = llvm::cast(op.getType()).getElementType(); + quant::QuantizedType quant_type; + SmallVector mins, maxs; + + if (op.getAxisStats().has_value()) { + // Per axis quantization (or per channel quantization) + int stats_num = op.getAxisStats()->getNumElements(); + if (stats_num == 0 || stats_num % 2 != 0) return failure(); + auto stats = llvm::dyn_cast(*op.getAxisStats()); + if (!stats) return failure(); + + for (auto it = stats.begin(), e = stats.end(); it != e; ++it) { + double rmin = FloatAttr::getValueAsDouble(*it++); + double rmax = FloatAttr::getValueAsDouble(*it); + // The default nudging implementation of mlir quant library might cause + // clamping during inference if the calibration range isn't wide enough. + // So here we adjust the range to include 0.0. + rmin = std::min(rmin, 0.0); + rmax = std::max(rmax, 0.0); + if (num_bits == 16) { + // TODO: b/266536261 - Since the kernel implementation assumes that + // 16x8 integer quantization is symmetric, this MLIR quantizer + // supports only symmetric quantization. 
+ rmax = std::max(std::abs(rmin), std::abs(rmax)); + rmin = -rmax; + } + TensorRangeSanityCheck(op, rmin, rmax); + mins.push_back(rmin); + maxs.push_back(rmax); + } + quant_type = quantfork::fakeQuantAttrsToType( + op.getLoc(), num_bits, *op.getAxis(), mins, maxs, narrow_range, + expressed, is_signed); + if (legacy_float_scale) { + quant_type = + mlir::TFL::DownCastScale(quant_type, mins, maxs, op->getLoc()); + } + } else if (auto stats = + llvm::dyn_cast(op.getLayerStats())) { + // Per tensor quantization + auto statValues = stats.getValues(); + double rmin = FloatAttr::getValueAsDouble(statValues[0]); + double rmax = FloatAttr::getValueAsDouble(statValues[1]); + // The default nudging implementation of mlir quant library might cause + // clamping during inference if the calibration range isn't wide enough. + // So here we adjust the range to include 0.0. + rmin = std::min(rmin, 0.0); + rmax = std::max(rmax, 0.0); + if (num_bits == 16) { + // TODO: b/266536261 - Since the kernel implementation assumes that + // 16x8 integer quantization is symmetric, this MLIR quantizer supports + // only symmetric quantization. + rmax = std::max(std::abs(rmin), std::abs(rmax)); + rmin = -rmax; + } + TensorRangeSanityCheck(op, rmin, rmax); + quant_type = + quantfork::fakeQuantAttrsToType(op.getLoc(), num_bits, rmin, rmax, + narrow_range, expressed, is_signed); + if (legacy_float_scale) { + quant_type = + mlir::TFL::DownCastScale(quant_type, rmin, rmax, op->getLoc()); + } + } else { + return failure(); + } + + rewriter.setInsertionPointAfter(op.getOperation()); + Type result_type = quant_type.castFromExpressedType(op.getType()); + auto q = + rewriter.create(op.getLoc(), result_type, op.getArg()); + q->setAttr(kVolatileOpAttrName, rewriter.getUnitAttr()); + + auto dq = rewriter.create(op.getLoc(), op.getType(), q); + op.getResult().replaceAllUsesWith(dq); + q.getOperation()->replaceUsesOfWith(dq, op.getArg()); + op.erase(); + + return success(); + } + + private: + int num_bits; + bool narrow_range; + bool is_signed; + bool legacy_float_scale; + + // Emits an op warning message if the calibrated range is larger than 10.0 and + // the storage type is less than or equal to 8 bits. + void TensorRangeSanityCheck(quantfork::StatisticsOp op, double& min, + double& max) const { + double range = std::fabs(max - min); + if (num_bits <= 8 && range >= 10.0) { + op.emitWarning() + << "Tensor range is too wide to be quantized. Use tf.clip_by_value " + "or tf.relu6 to narrow the tensor range. Range: " + << range << ", bit width: " << num_bits; + } + if (std::abs(max - min) < kNearZeroTolerance) { + op.emitWarning() << "Tensor range (" << min << ", " << max + << ") is too narrow and it might cause overflow. " + "Expanding range symmetrically by " + << kNearZeroTolerance; + min -= kNearZeroTolerance; + max += kNearZeroTolerance; + } + } +}; + +template +bool UsedBy(mlir::Operation* op) { + for (mlir::Operation* user : op->getUsers()) { + if (llvm::isa_and_nonnull(user)) return true; + } + return false; +} + +template +void CreateVerifier(mlir::Operation* quantizing_op, + mlir::Operation* quantized_op, PatternRewriter& rewriter, + int result_idx, const QuantPassSpec& quant_params) { + rewriter.setInsertionPointAfter(quantized_op); + FloatAttr tolerance = rewriter.getF32FloatAttr( + quant_params.numeric_verify_spec.error_tolerance); + BoolAttr log = + rewriter.getBoolAttr(quant_params.numeric_verify_spec.log_if_failed_flag); + // Verify the quantized value by sending the result to the verifier. 
+ rewriter.create( + quantizing_op->getLoc(), quantized_op->getResult(result_idx).getType(), + quantized_op->getResult(result_idx), quantizing_op->getResult(result_idx), + tolerance, log); +} + +template <> +inline bool UsedBy(mlir::Operation* op) { + return false; +} + +// This specialization is not going to be called, but needed for compilation. +template <> +inline void CreateVerifier(mlir::Operation* quantizing_op, + mlir::Operation* quantized_op, + PatternRewriter& rewriter, int result_idx, + const QuantPassSpec& quant_params) {} + +// A base rewrite pattern which matches any N-in-M-out operations with +// quantization parameters propagated to at least one of its operands. The +// quantization parameters are annotated by the QuantizeOp/DequantizeOp pairs. +// Each matched pattern are rewritten by its quantized alternatives. +// +// The concrete pattern, extends from this base pattern, can specify whether it +// allows dynamic range quantized operands and results for the operations in the +// current context. These "DynamicRangeQuantized" operands and results don't +// have quantization parameters propagated to, so will be in float in the +// quantized results. The concrete pattern should define the following two +// functions: +// +// bool AllowDynamicRangeQuantizedOperand(Operation *) const +// bool AllowDynamicRangeQuantizedResult(Operation *) const +// +// Full integer quantization disallows "DynamicRangeQuantized" operands or +// results. Dynamic range quantization allows "DynamicRangeQuantized" operands +// and results. +template +class QuantizationPattern : public RewritePattern { + public: + using BaseType = QuantizationPattern; + + explicit QuantizationPattern(MLIRContext* context, + const QuantPassSpec& quant_params) + // Set the score to a large number so it is always preferred. + : RewritePattern(RootOpT::getOperationName(), 300, context), + quant_params_(quant_params) {} + + LogicalResult matchAndRewrite(mlir::Operation* op, + PatternRewriter& rewriter) const override { + llvm::SmallVector quantizing_ops; + + // Collect all the ops to quantize, as the user / producer of the root op. + if constexpr (std::is_same_v) { + if (op->getNumResults() != 1) { + return failure(); + } + auto users = op->getResult(0).getUsers(); + quantizing_ops.append(users.begin(), users.end()); + } else if constexpr (std::is_same_v) { + if (op->getNumOperands() != 1) { + return failure(); + } + Value quantize_operand = op->getOperand(0); + if (QuantizedType::getQuantizedElementType(quantize_operand.getType())) { + // The input of this QuantizeOp has already been quantized, i.e. + // rescale. + return failure(); + } + DenseFPElementsAttr attr; + if (matchPattern(quantize_operand, m_Constant(&attr))) { + // Const-> QuantizeOp pattern will be handled separately. 
+ return failure(); + } + if (mlir::Operation* quantizing_op = quantize_operand.getDefiningOp()) { + quantizing_ops.push_back(quantizing_op); + } + } + + tensorflow::DataType inference_type = + quant_params_.quant_spec.inference_type; + bool weight_only_quantization = + quant_params_.quant_spec.weight_only_quantization; + bool enable_verify = quant_params_.numeric_verify_spec.verify_numeric; + bool enable_whole_model_verify = + quant_params_.numeric_verify_spec.whole_model_verify; + absl::flat_hash_set ops_blocklist = + quant_params_.quant_spec.ops_blocklist; + absl::flat_hash_set nodes_blocklist = + quant_params_.quant_spec.nodes_blocklist; + CustomMap custom_map = quant_params_.quant_spec.custom_map; + + // Rewrite the floating-point ops to the quantized version, by fusing + // preceding dequantize ops and succeding quantize ops. + for (mlir::Operation* quantizing_op : quantizing_ops) { + // If it is requantize op, we shouldn't rewrite this op. + if (llvm::isa(quantizing_op)) { + return failure(); + } + + // If the op is terminator, not quantizable or any ops from the mlir quant + // ops dialect, we shouldn't rewrite. In case of whole-model verify debug + // mode, not-quantizable ops should be duplicated to keep parallel + // float/quant model execution. + if (quantizing_op->hasTrait()) { + return failure(); + } + + if (!IsOpQuantizable(quantizing_op) && + !static_cast(this)->IsQuantizableCustomOp( + quantizing_op, custom_map)) { + if (!(enable_verify && enable_whole_model_verify)) { + return failure(); + } + if (quantizing_op->hasAttr(kDebugModeOpQuantAttrName) || + quantizing_op->hasAttr(kDebugModeOpFloatAttrName)) { + return failure(); + } + + rewriter.setInsertionPoint(quantizing_op); + mlir::Operation* float_op = rewriter.clone(*quantizing_op); + quantizing_op->setAttr(kDebugModeOpQuantAttrName, + rewriter.getUnitAttr()); + float_op->setAttr(kDebugModeOpFloatAttrName, rewriter.getUnitAttr()); + RewireFloatModelBackbone(quantizing_op, float_op); + return success(); + } + + // Blocklist op is checked in advance for non-dynamic range quantization + // case. + if (!quant_params_.quant_spec.weight_quantization && + (ops_blocklist.find(quantizing_op->getName().getStringRef().str()) != + ops_blocklist.end())) { + return failure(); + } + + if (!nodes_blocklist.empty()) { + if (auto name_loc = llvm::dyn_cast(quantizing_op->getLoc())) { + std::string sloc = name_loc.getName().str(); + if (!sloc.empty() && + (nodes_blocklist.find(sloc) != nodes_blocklist.end())) { + return failure(); + } + } + } + + // An op with float inputs and outputs are expected when it's used by a + // NumericVerify op. Skip this op. + if (enable_verify && UsedBy(quantizing_op)) { + continue; + } + + bool is_operand_or_result_modified = false; + // Collect all the quantized inputs and "clone" the matched op by these + // inputs. 
+ SmallVector inputs; + inputs.reserve(quantizing_op->getNumOperands()); + for (auto operand : quantizing_op->getOperands()) { + Type operand_type = operand.getType(); + if (isa(operand_type)) { + inputs.push_back(operand); + continue; + } + + auto ele_type = + llvm::cast(operand.getType()).getElementType(); + if (static_cast(this) + ->AllowDynamicRangeQuantizedOperand(quantizing_op, + custom_map)) { + auto dq_op = dyn_cast_or_null(operand.getDefiningOp()); + + if (dq_op && inference_type == tensorflow::DT_QINT8 && + !static_cast(this)->IsWeightOnlyOp( + quantizing_op, ops_blocklist, weight_only_quantization, + custom_map)) { + // Dynamic range quantization is applied by having QuantizeOp as an + // input. Only int8 weight is supported for now. + inputs.push_back(dq_op.getOperand()); + is_operand_or_result_modified = true; + } else { + // Otherwise, it's the case where the operand is activations or the + // quantizing_op is non-supported/weight-only. + inputs.push_back(operand); + } + } else { + if (auto dq_op = + dyn_cast_or_null(operand.getDefiningOp())) { + is_operand_or_result_modified = true; + inputs.push_back(dq_op.getOperand()); + } else if (!ele_type.isF32()) { + // If the operand is an integer tensor, then it doesn't require the + // DequantizeOp in the pattern. + inputs.push_back(operand); + } else { + return failure(); + } + } + } + + mlir::Operation* quantized_op; + if (QuantizableOpSupportsFloatOutputType(quantizing_op)) { + rewriter.setInsertionPointAfter(quantizing_op); + OperationState new_state( + quantizing_op->getLoc(), quantizing_op->getName().getStringRef(), + inputs, quantizing_op->getResultTypes(), quantizing_op->getAttrs()); + for (const auto& indexed_regions : + llvm::enumerate(quantizing_op->getRegions())) { + Region* target_region = new_state.addRegion(); + IRMapping mapping; + indexed_regions.value().cloneInto(target_region, mapping); + } + quantized_op = rewriter.create(new_state); + rewriter.replaceOp(quantizing_op, quantized_op); + } else { + // Collect all the quantized outputs and replace them by the results of + // the new quantized op. + llvm::SmallDenseMap outputs_replaced; + SmallVector output_types; + output_types.reserve(quantizing_op->getNumResults()); + for (const auto& enumerated_result : + llvm::enumerate(quantizing_op->getResults())) { + Value result = enumerated_result.value(); + Type result_type = result.getType(); + // Add this to the test coverage once we create test ops with none + // type results. + if (isa(result_type)) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result_type); + continue; + } + Type result_ele_type = + llvm::cast(result.getType()).getElementType(); + // If the user is the QuantizeOp, it must be the only user. + if (result.hasOneUse() && + llvm::isa(*result.user_begin())) { + auto user = llvm::cast(*result.user_begin()); + outputs_replaced.insert( + {user.getResult(), enumerated_result.index()}); + output_types.push_back(user.getType()); + is_operand_or_result_modified = true; + } else if (!result_ele_type.isF32()) { + // If the result is an integer tensor, then it doesn't require the + // D op in the pattern. 
+ outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result.getType()); + } else if (static_cast(this) + ->AllowDynamicRangeQuantizedResult(quantizing_op, + custom_map)) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result.getType()); + } else { + return failure(); + } + } + + // For float16 quantization if none of the operand or result is + // modified, replacing the op. See b/335025403. + if (inference_type == tensorflow::DT_HALF && + !is_operand_or_result_modified) { + return failure(); + } + + rewriter.setInsertionPointAfter(quantizing_op); + OperationState new_state( + quantizing_op->getLoc(), quantizing_op->getName().getStringRef(), + inputs, output_types, quantizing_op->getAttrs()); + for (int i = 0; i < quantizing_op->getNumRegions(); ++i) { + new_state.addRegion(); + } + quantized_op = rewriter.create(new_state); + if (quantizing_op->getNumRegions() != 0) { + for (const auto& indexed_regions : + llvm::enumerate(quantizing_op->getRegions())) { + Region& target_region = + quantized_op->getRegion(indexed_regions.index()); + IRMapping mapping; + indexed_regions.value().cloneInto(&target_region, mapping); + } + } + for (auto output : outputs_replaced) { + output.getFirst().replaceAllUsesWith( + quantized_op->getResult(output.getSecond())); + } + } + + // To verify the numericals, the original floating-point ops are + // preserved in the graph. The result of these floating-point ops are sent + // to a numeric verifier op as the reference. + if (enable_verify && !std::is_same_v) { + // For constant operands, the floating-point constant is duplicated in + // case it is quantized. + for (int i = 0, e = quantized_op->getNumOperands(); i < e; ++i) { + auto def = quantized_op->getOperand(i).getDefiningOp(); + if (auto q = llvm::dyn_cast_or_null(def)) { + DenseFPElementsAttr attr; + if (!matchPattern(q.getOperand(), m_Constant(&attr))) { + continue; + } + auto cst = rewriter.create( + quantized_op->getLoc(), attr); + quantizing_op->setOperand(i, cst.getResult()); + } + } + + for (int i = 0, e = quantized_op->getNumResults(); i < e; ++i) { + if (!isa( + cast(quantizing_op->getResult(i).getType()) + .getElementType())) { + continue; + } + CreateVerifier(quantizing_op, quantized_op, rewriter, i, + quant_params_); + + if (enable_whole_model_verify) { + RewireFloatModelBackbone(quantized_op, quantizing_op); + } + } + } + } + return success(); + } + + private: + // Reconnects float ops in the whole-model verify mode. Works for both + // Quantizable ops and Unquantizable ops + void RewireFloatModelBackbone(mlir::Operation* quantized_op, + mlir::Operation* float_op) const { + for (int i = 0, e = quantized_op->getNumResults(); i < e; ++i) { + if (!llvm::cast(float_op->getResult(i).getType()) + .getElementType() + .isF32()) { + continue; + } + // Find the Quantize/Dequantize users of the new op results, and replace + // the usage. Then all the floating-point ops are connected, forming a + // separate float "backbone" model that the quantized model can be + // compared against in parallel. + // N.B. the return op will use this floating-point result. + Value result; + if (!IsOpQuantizable(float_op)) { + // For not quantizable ops, search for dequantize attached to the + // quantized op of the output. 
+ if (mlir::Operation* quantize_op = dyn_cast_or_null( + *quantized_op->getResult(i).getUsers().begin())) { + result = quantize_op->getResult(0); + } else { + quantized_op->emitError() + << "Output[" << i + << "] is expected to have only one user [QUANTIZE]"; + return; + } + } else { + result = quantized_op->getResult(i); + } + for (auto user : result.getUsers()) { + // Skip the Requantize op and set the user to the following dequantize + // op. This happens when the quantizer tries to match the scale conflict + // with QuantizeOp - QuantizeOp(requant) - DequantizeOp triples. The + // correct float op should be the user of the last DequantizeOp. + if (llvm::isa(user)) { + user = *user->getResult(0).getUsers().begin(); + } + if (auto dequantize = llvm::dyn_cast(user)) { + // Replace all uses, except not quantizable ops that are being used in + // the float backbone. + dequantize.getResult().replaceUsesWithIf( + float_op->getResult(i), [&](OpOperand& use) { + return !use.getOwner()->hasAttr(kDebugModeOpQuantAttrName); + }); + } + } + } + } + + QuantPassSpec quant_params_; +}; + +// A pattern that removes debug attributes that are annotated to ops during +// the debug model creation. +class RemoveDebugAttrPattern : public RewritePattern { + public: + explicit RemoveDebugAttrPattern(MLIRContext* context) + : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {} + LogicalResult matchAndRewrite(mlir::Operation* op, + PatternRewriter& rewriter) const override; +}; + +// Converts quantized tensor type with signed integer type to quantized tensor +// type with unsigned integer type. +Type ConvertSignedQuantizedToUnsigned(Type signed_tensor_type, Location loc); + +// Converts quantize ops with unsigned quantized types to these with signed +// quantized types and preserves the scales. +template +struct ConvertUnsignedToSigned : public OpRewritePattern { + using BaseType = ConvertUnsignedToSigned; + using QType = quant::QuantizedType; + + explicit ConvertUnsignedToSigned(MLIRContext* context) + : OpRewritePattern(context, 1) {} + + LogicalResult matchAndRewrite(QuantizeOpT op, + PatternRewriter& rewriter) const override { + Type output_type = op.getResult().getType(); + auto qtype = QType::getQuantizedElementType(output_type); + if (!qtype || qtype.isSigned()) return failure(); + + int num_bits = qtype.getStorageTypeIntegralWidth(); + if (num_bits == 8) { + // If storage is 8-bit, trained num bits may be less than 8 so check here. + num_bits = + static_cast(std::ceil(std::log2(qtype.getStorageTypeMax()))); + } + // This is a positive value, and will be applied on zero points and fixed + // point ranges. 
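+    // Illustrative arithmetic for the conversion below, including the sub-8-bit
+    // case recovered just above: a storage maximum of 15 gives ceil(log2(15)) = 4
+    // bits, so the offset is 0 - (-(1 << 3)) = 8 and an unsigned zero point of 11
+    // becomes 11 - 8 = 3 in the signed type; for a full 8-bit range the offset is
+    // 128. Minimal sketch (hypothetical helper):
+    //
+    //   #include <cmath>
+    //   #include <cstdint>
+    //
+    //   inline int64_t UnsignedToSignedOffset(int64_t storage_type_max) {
+    //     const int num_bits =
+    //         static_cast<int>(std::ceil(std::log2(storage_type_max)));
+    //     const int64_t unsigned_min = 0;
+    //     const int64_t signed_min = -(int64_t{1} << (num_bits - 1));
+    //     return unsigned_min - signed_min;  // positive: 8 for 4 bits, 128 for 8 bits
+    //   }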
+ int64_t offset = + QType::getDefaultMinimumForInteger(/*isSigned=*/false, num_bits) - + QType::getDefaultMinimumForInteger(/*isSigned=*/true, num_bits); + + auto flags = quant::QuantizationFlags::Signed; + QType new_qtype; + if (auto uqtype = llvm::dyn_cast(qtype)) { + new_qtype = quant::UniformQuantizedType::getChecked( + op.getLoc(), flags, qtype.getStorageType(), qtype.getExpressedType(), + uqtype.getScale(), uqtype.getZeroPoint() - offset, + uqtype.getStorageTypeMin() - offset, + uqtype.getStorageTypeMax() - offset); + } else if (auto aqtype = + llvm::dyn_cast(qtype)) { + auto zero_points = aqtype.getZeroPoints(); + llvm::SmallVector new_zero_points(zero_points.begin(), + zero_points.end()); + for (int i = 0, e = new_zero_points.size(); i < e; ++i) { + new_zero_points[i] -= offset; + } + new_qtype = quant::UniformQuantizedPerAxisType::getChecked( + op.getLoc(), flags, qtype.getStorageType(), qtype.getExpressedType(), + aqtype.getScales(), new_zero_points, aqtype.getQuantizedDimension(), + aqtype.getStorageTypeMin() - offset, + aqtype.getStorageTypeMax() - offset); + } else { + return failure(); + } + + if (!new_qtype) return failure(); + Type new_output_type = new_qtype.castFromExpressedType( + QType::castToExpressedType(output_type)); + rewriter.replaceOpWithNewOp(op, new_output_type, op.getArg()); + return success(); + } +}; + +// Fold Extra Requantize ops if the preceding ops has free scale requirement. +template +struct FoldTrivalRequantizeOp : public OpRewritePattern { + explicit FoldTrivalRequantizeOp(MLIRContext* context) + : OpRewritePattern(context, 1) {} + + LogicalResult matchAndRewrite(RequantizeOpT op, + PatternRewriter& rewriter) const override { + Value pre_quantized = op->getOperand(0); + auto pre_quantized_type = + quant::QuantizedType::getQuantizedElementType(pre_quantized.getType()); + if (!pre_quantized_type) return failure(); + + mlir::Operation* def = pre_quantized.getDefiningOp(); + if (!def) return failure(); + if (llvm::isa(def) || + !def->hasTrait()) { + return failure(); + } + + // This op should not clobber def, if more than one requant of this value. + if (!pre_quantized.hasOneUse()) { + return failure(); + } + + op.emitWarning("Remove trivial `rescale` op. Please fix the source graph."); + + llvm::SmallVector new_output_types; + for (auto result : def->getResults()) { + if (result.hasOneUse() && *result.getUsers().begin() == op) { + new_output_types.push_back(op.getResult().getType()); + } else { + new_output_types.push_back(result.getType()); + } + } + + // Remove this rescale op. + rewriter.replaceOp(op, {pre_quantized}); + + // Replace the output scale of the preceding op. + rewriter.setInsertionPointAfter(def); + OperationState new_state(def->getLoc(), def->getName().getStringRef(), + def->getOperands(), new_output_types, + def->getAttrs()); + Operation* new_op = rewriter.create(new_state); + + rewriter.replaceOp(def, new_op->getResults()); + return success(); + } +}; + +// Given a quantized type `input`, magnifying its scales by the factor stored in +// `factor`. If `input` isn't a quantized type or the `factor` doesn't match the +// dimension size of `input` or isn't floating-point, nullptr will be returned. +TypeAttr RescaleQuantizedType(Type input, Attribute factor); + +// Converts the min/max/num_bits/narrow_range information to a +// QuantizedType, and then returns the attribute containing the QuantizedType. 
+// The `min` and `max` arguments can be FloatAttr or DenseFPElementsAttr and +// returns UniformQuantizedType or UniformQuantizedPerAxisType respectively. +// `narrow_range` is set to true for weights and `is_signed` is set to true +// if it is using signed int symmetric quantization. +// +// Note that this method may broadcast min and max to match the dimension length +// of `input_type`, if the `quant_dim` is valid. On the other hand, the +// symmetry of min and max is not adjusted by this method. The QAT workflow +// should set min/max correctly (and use `narrow_range`=true, `is_signed`=true) +// if symmetric quantization is required. +TypeAttr GetQuantizedTypeAttr(Builder builder, Type input_type, Attribute min, + Attribute max, int quant_dim, + IntegerAttr num_bits, BoolAttr narrow_range, + bool is_signed, bool legacy_float_scale = false, + bool use_fake_quant_num_bits = false); + +// Casts the `target` type to a quantized type by using the quantization +// parameters from the type in the `source` type attribute. +// Examples: +// f32 -> !quant.uniform +// tensor<4xf32> -> tensor<4x!quant.uniform> +// The result is wrapped by a type attribute. Returns nullptr if the cast +// isn't valid. +// +// `axis` is to specify the quantization dimension in the `target` and only +// used if the element type of `source` is a per-channel quantized type. During +// the casting, the quantization dimension of the result type needs to be set +// this new `axis` value. +TypeAttr CastQuantizedTypeAttrFromExpressedType(Builder builder, + TypeAttr source, Type target, + int axis); + +// Quantizes the elements in the attribute `real_value` by the quantization +// parameters in `tensor_type`. Returns empty Attribute if the +// `tensor_type` is not a QuantizedType or the quantization fails. +ElementsAttr Quantize(Attribute real_value, Type tensor_type); + +// Quantizes the elements in "legacy mode", where it calls TOCO's methods to +// to quantize values with float scale. +ElementsAttr QuantizeLegacy(Attribute real_value, Type tensor_type); + +// Returns the quantized type for an element attribute. The quantization +// parameters in this type is based on the min and max element of the +// attribute. When the elements in the `attr` are not in floating-point, or +// the value range isn't straddling zero, an empty type is returned. The min/max +// are adjusted to be symmetric if `symmetric` flag is set to True. And +// `symmetric` can only be set to true when it is signed and narrow_range. +Type GetUniformQuantizedTypeForWeight(ElementsAttr attr, bool symmetric, + unsigned num_bits, bool is_signed, + bool narrow_range, + bool legacy_float_scale = false, + bool use_fake_quant_num_bits = false); + +// Returns the per channel quantized type for an element attribute. +// `quant_dim` defines the quantization axis. The channel min/max are adjusted +// to be symmetric if `symmetric` flag is set to True. And `symmetric` can only +// be set to true when it is signed and narrow_range. +Type GetUniformQuantizedPerAxisTypeForWeight( + ElementsAttr attr, int quant_dim, bool symmetric, unsigned num_bits, + bool is_signed, bool narrow_range, bool legacy_float_scale = false, + bool use_fake_quant_num_bits = false); + +// Returns the quantized type of a bias input, given the quantized types of +// other operands which are multiply-accumulated (the bias is added to the +// accumulated value). 
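+// Illustrative arithmetic for the declaration below: the bias scale is the product
+// of the scales of the operands that are multiply-accumulated, e.g. an input scale
+// of 0.5 and per-channel weight scales {0.1, 0.2} give int32 bias scales
+// {0.05, 0.1} with zero point 0, so a bias of 1.0 in channel 0 quantizes to
+// round(1.0 / 0.05) = 20. Minimal sketch (hypothetical helper, plain C++):
+//
+//   #include <cmath>
+//   #include <cstdint>
+//   #include <vector>
+//
+//   inline std::vector<int32_t> QuantizeBiasPerChannel(
+//       const std::vector<float>& bias, float input_scale,
+//       const std::vector<float>& weight_scales) {
+//     std::vector<int32_t> quantized(bias.size());
+//     for (size_t i = 0; i < bias.size(); ++i) {
+//       const double bias_scale =
+//           static_cast<double>(input_scale) * weight_scales[i];
+//       quantized[i] = static_cast<int32_t>(std::llround(bias[i] / bias_scale));
+//     }
+//     return quantized;
+//   }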
+quant::QuantizedType GetUniformQuantizedTypeForBias( + const std::vector& op_types, int adjusted_quant_dim, + bool legacy_float_scale = false); + +// Gets quantization scale specs (e.g. fixed output range, same result and +// operand scales) from the default quantization interfaces. The op should +// outlive returned spec for its interface methods to be properly referenced. +std::unique_ptr GetDefaultQuantScaleSpec(Operation* op); + +// The function might contain more stats ops than required, and it will +// introduce requantize if the calibration stats have conflicts. This method +// tries to remove all the redundant stats ops. +bool RemoveRedundantStatsOps(mlir::func::FuncOp func, + OpQuantSpecGetter op_quant_spec_getter, + OpQuantScaleSpecGetter op_quant_scale_spec_getter = + GetDefaultQuantScaleSpec); + +// Given quantization parameters for int8, compute the quantization parameters +// for uint if it is required, and wrap the result in an UniformQuantizedType. +quant::UniformQuantizedType GetFixedOutputRange(bool is_signed, int bit_width, + Type tensor_type, double scale, + int64_t zero_point, + int64_t storage_min, + int64_t storage_max); + +quant::UniformQuantizedType GetFixedOutputRange(bool is_signed, int bit_width, + Type tensor_type, double scale, + int64_t zero_point); + +// Extracts min and max values from the DenseFPElementsAttr, and stores them +// into `mins` and `maxs`. When mins and maxs are extracted per-channel, +// `dim_size` is number of channels and `slice_size` is the size of slice per +// each channel. When `symmetric` is true, the range is expanded to [-M, M]. +void ExtractMinMaxFromAttr(DenseFPElementsAttr values, int dim_size, + int slice_size, bool symmetric, + SmallVectorImpl& mins, + SmallVectorImpl& maxs); + +// Returns the quantized type for the +// input_type/min/max/storage_type_width/narrow_range. +Type GetQuantizedType(Builder builder, Type input_type, ArrayRef min, + ArrayRef max, int quant_dim, + int storage_type_width, bool narrow_range, bool is_signed, + bool legacy_float_scale = false, + bool use_fake_quant_num_bits = false); +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/transforms/tfl_quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/tfl_quantization_driver.cc similarity index 84% rename from tensorflow/compiler/mlir/lite/transforms/tfl_quantization_driver.cc rename to tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/tfl_quantization_driver.cc index 697cda55a43b..d011e8235d6c 100644 --- a/tensorflow/compiler/mlir/lite/transforms/tfl_quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/tfl_quantization_driver.cc @@ -13,13 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/lite/transforms/tfl_quantization_driver.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/tfl_quantization_driver.h" +#include #include #include #include #include #include +#include #include #include @@ -39,15 +41,19 @@ limitations under the License. 
#include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_traits.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" namespace mlir { namespace TFL { +namespace temp { namespace { +using ::mlir::Operation; + constexpr int32_t kBiasMax = std::numeric_limits::max() / 2; // Uses the type of `value` to set the initial state of the index-th result if @@ -134,12 +140,11 @@ void QuantizationDriver::InitializeResultState(Operation* op, const int index, value_to_state_, operand_states_, result_states_); } -std::unique_ptr QuantizationDriver::GetQuantSpec( - Operation* op) { +std::unique_ptr QuantizationDriver::GetQuantSpec(Operation* op) { return op_quant_spec_getter_(op); } -std::unique_ptr QuantizationDriver::GetQuantScaleSpec( +std::unique_ptr QuantizationDriver::GetQuantScaleSpec( Operation* op) { return op_quant_scale_spec_getter_(op); } @@ -171,12 +176,12 @@ bool QuantizationDriver::SetConstantResultParams(Operation* op) { // narrow range. // per-axis quantization weight, with symmetric min/max enforced. - final_type = quant::GetUniformQuantizedPerAxisTypeForWeight( + final_type = GetUniformQuantizedPerAxisTypeForWeight( attr, it->second, /*symmetric=*/true, /*num_bits=*/8, is_signed_, /*narrow_range=*/true, legacy_float_scale_); } else { // per-tensor quantization weight - final_type = quant::GetUniformQuantizedTypeForWeight( + final_type = GetUniformQuantizedTypeForWeight( attr, /*symmetric=*/is_weight && is_signed_, /*num_bits=*/8, is_signed_, /*narrow_range=*/is_weight, legacy_float_scale_); @@ -209,7 +214,7 @@ bool QuantizationDriver::SetResultParams(Operation* op, const int result_index, QuantizedType QuantizationDriver::GetBiasParams( Operation* op, const int bias_index, const ArrayRef non_bias_operand_indices, - const quant::AccumulatorScaleFunc func) { + const AccumulatorScaleFunc func) { QuantState& bias_state = GetOperandQuantState(op, bias_index); if (!bias_state.IsEmpty()) { return bias_state.params; @@ -302,7 +307,7 @@ void QuantizationDriver::QuantizeValue(Value value, // quantization pass. These ops can be removed without losing original // program accuracy. // TODO: b/323478683 - Make the attribute being part of op definition. - quantize->setAttr(quant::kVolatileOpAttrName, builder_.getUnitAttr()); + quantize->setAttr(kVolatileOpAttrName, builder_.getUnitAttr()); // `original_result` has a use to `quantize`, so this will replace that use // by the result of `dequantize`. 
Remember to reset that use afterwards @@ -512,10 +517,10 @@ void QuantizationDriver::PreprocessConstantOps() { uses.push_back({use.getOwner(), use.getOperandNumber()}); } for (const auto [user, operand_num] : uses) { - const std::unique_ptr spec = GetQuantSpec(user); - const std::unique_ptr scale_spec = + const std::unique_ptr spec = GetQuantSpec(user); + const std::unique_ptr scale_spec = GetQuantScaleSpec(user); - const quant::BiasParamsMap biases = spec->biases_params; + const BiasParamsMap biases = spec->biases_params; // The quantization parameters of a `weight` shouldn't be determined by // other values. So any constants which are not bias, an operand of an @@ -563,9 +568,8 @@ void QuantizationDriver::SetupAllStates() { } fn_.walk([&](Operation* op) { - std::unique_ptr scale_spec = GetQuantScaleSpec(op); - if (!quant::IsOpQuantizable(op) && - !scale_spec->has_same_scale_requirement) { + std::unique_ptr scale_spec = GetQuantScaleSpec(op); + if (!IsOpQuantizable(op) && !scale_spec->has_same_scale_requirement) { return; } work_list_.push_back(op); @@ -768,6 +772,85 @@ void QuantizationDriver::Initialize() { SetupAllStates(); } +namespace { + +bool IsConcatWithUint8QuantizedTypes(Operation* op) { + auto concat = mlir::dyn_cast_or_null(op); + if (!concat) { + return false; + } + + QuantizedType t = nullptr; + for (auto operand : concat.getOperands()) { + auto def_op = operand.getDefiningOp(); + if (!def_op) { + continue; + } + + auto dq_op = mlir::dyn_cast_or_null(def_op); + if (!dq_op) { + continue; + } + + auto qtype = + QuantizedType::getQuantizedElementType(dq_op.getArg().getType()); + if (!qtype) { + continue; + } + + t = qtype; + break; + } + + if (!t) { + return false; + } + + auto st = mlir::dyn_cast_or_null(t.getStorageType()); + if (!st) { + return false; + } + + return !t.isSigned() && st.getWidth() == 8; +} + +std::tuple ExtractMinMax(UniformQuantizedType type) { + double scale = type.getScale(); + int64_t zero_point = type.getZeroPoint(); + int64_t storage_type_min = type.getStorageTypeMin(); + int64_t storage_type_max = type.getStorageTypeMax(); + double real_min = static_cast(storage_type_min - zero_point) * scale; + double real_max = static_cast(storage_type_max - zero_point) * scale; + return {real_min, real_max}; +} + +QuantizedType CalculateNewQuantizedType( + llvm::ArrayRef qtypes) { + if (qtypes.size() == 1) { + return qtypes[0]; + } + + double real_min = std::numeric_limits::max(); + double real_max = std::numeric_limits::min(); + for (auto uniform_qtype : qtypes) { + auto min_max = ExtractMinMax(uniform_qtype); + real_min = std::min(real_min, std::get<0>(min_max)); + real_max = std::max(real_max, std::get<1>(min_max)); + } + auto uniform_qtype = qtypes[0]; + double q_min = static_cast(uniform_qtype.getStorageTypeMin()); + double q_max = static_cast(uniform_qtype.getStorageTypeMax()); + double scale = (real_max - real_min) / (q_max - q_min); + int64_t zero_point = static_cast(q_min - (real_min / scale)); + + return UniformQuantizedType::get( + uniform_qtype.getFlags(), uniform_qtype.getStorageType(), + uniform_qtype.getExpressedType(), scale, zero_point, + uniform_qtype.getStorageTypeMin(), uniform_qtype.getStorageTypeMax()); +} + +} // namespace + // Propagates the quantization parameters to the operands, results, and biases. // TODO: b/323478683 - Do not use while loop to handle this logic. 
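+// Illustrative arithmetic for CalculateNewQuantizedType above, assuming two uint8
+// operand types: scale 0.02 / zero_point 10 covers reals [-0.2, 4.9], and scale
+// 0.01 / zero_point 200 covers [-2.0, 0.55]; the merged range [-2.0, 4.9] yields a
+// new scale of 6.9 / 255 ≈ 0.0271 and a new zero point of 0 - (-2.0 / 0.0271) ≈ 73
+// (truncated, as in the code above). Minimal standalone sketch:
+//
+//   #include <algorithm>
+//   #include <cstdint>
+//   #include <utility>
+//   #include <vector>
+//
+//   struct RealRange { double min; double max; };
+//
+//   inline std::pair<double, int64_t> MergeUint8Ranges(
+//       const std::vector<RealRange>& ranges) {
+//     double real_min = ranges.front().min;
+//     double real_max = ranges.front().max;
+//     for (const RealRange& r : ranges) {
+//       real_min = std::min(real_min, r.min);
+//       real_max = std::max(real_max, r.max);
+//     }
+//     const double scale = (real_max - real_min) / 255.0;  // uint8: q_max - q_min
+//     const int64_t zero_point = static_cast<int64_t>(0.0 - real_min / scale);
+//     return {scale, zero_point};
+//   }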
bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { @@ -785,7 +868,7 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { // If the workflow requires inferring ranges from the content // (post-training quantization) and it is weight (filter) and hasn't // been quantized, we infer the quantization parameters from the content. - if (qdq_conversion_mode_ != quant::QDQConversionMode::kQDQStrict && + if (qdq_conversion_mode_ != QDQConversionMode::kQDQStrict && infer_tensor_range_ && IsWeight(constant_op) && !IsQuantized(op)) { // The quantization parameters are determined by the content of the // constant. @@ -794,7 +877,103 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { continue; } - std::unique_ptr scale_spec = GetQuantScaleSpec(op); + if (qdq_conversion_mode_ != QDQConversionMode::kQDQStrict && + IsConcatWithUint8QuantizedTypes(op)) { + auto concat = mlir::dyn_cast_or_null(op); + llvm::DenseMap operand_qtypes; + auto operands = concat.getOperands(); + for (auto i = 0; i < operands.size(); i++) { + auto op = operands[i].getDefiningOp(); + if (!op) { + continue; + } + + auto dq_op = mlir::dyn_cast_or_null(op); + if (!dq_op) { + continue; + } + + auto qtype = + QuantizedType::getQuantizedElementType(dq_op.getArg().getType()); + if (!qtype) { + continue; + } + + auto uniform_qtype = + mlir::dyn_cast_or_null(qtype); + if (!uniform_qtype) { + continue; + } + + operand_qtypes[i] = uniform_qtype; + } + + llvm::DenseMap result_qtypes; + llvm::SmallVector users(op->user_begin(), op->user_end()); + for (auto i = 0; i < users.size(); i++) { + auto user = users[i]; + auto q_op = mlir::dyn_cast_or_null(user); + if (!q_op) { + continue; + } + + auto qtype = QuantizedType::getQuantizedElementType(q_op.getType()); + if (!qtype) { + continue; + } + + auto uniform_qtype = + mlir::dyn_cast_or_null(qtype); + if (!uniform_qtype) { + continue; + } + + result_qtypes[i] = uniform_qtype; + } + + // If all operands and results are already quantized then leave it be. + if (operand_qtypes.size() == operands.size() && + result_qtypes.size() == users.size()) { + continue; + } + + // Calculate a new scale and zp using existing parameters. + // If no result qtype exists then calculate a new one based off of the + // ones specified on the operands. + // If no operand qtypes exist use the result qtype. + // We know that at least one operand or result type is quantized at this + // point. + llvm::SmallVector qtypes; + if (result_qtypes.empty()) { + for (auto [idx, qtype] : operand_qtypes) { + qtypes.push_back(qtype); + } + } else { + qtypes.push_back(result_qtypes[0]); + } + + auto new_qtype = CalculateNewQuantizedType(qtypes); + + for (int i = 0; i < op->getNumOperands(); ++i) { + auto it = operand_qtypes.find(i); + if (it != operand_qtypes.end()) { + continue; + } + changed |= SetOperandParams(op, i, new_qtype); + } + + for (int i = 0; i < op->getNumResults(); ++i) { + auto it = result_qtypes.find(i); + if (it != result_qtypes.end()) { + continue; + } + changed |= SetResultParams(op, i, new_qtype); + } + + continue; + } + + std::unique_ptr scale_spec = GetQuantScaleSpec(op); if (scale_spec->has_same_scale_requirement) { const QuantizedType params = GetQuantParamsForSameScaleConstraint(op); @@ -820,7 +999,7 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { // and TFL_ReshapeOp. And the output q-dq propagation for this Op is // performed in `PropagateTransposedPerAxisQuantDim` and // `PropagateReshapedPerAxisQuantDim` respectively. 
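The new concat branch in `PropagateParamsAndReturnIfChanged` only touches ports that do not yet carry quantization parameters: it skips concats whose operands and results are all annotated, reuses an existing result type when one is present, and otherwise merges the operand types. The following is a loose, self-contained model of that control flow, using a hypothetical `Params` range type instead of MLIR types; it is a sketch of the decision logic, not the actual implementation.

```cpp
#include <algorithm>
#include <cstdio>
#include <limits>
#include <optional>
#include <vector>

// Hypothetical stand-in for a uniform quantized type: just its real range.
struct Params {
  double real_min;
  double real_max;
};

// Fills in parameters for ports of a concat-like op that have none yet;
// returns true if anything changed (mirrors the `changed |= ...` updates).
bool PropagateConcatParams(std::vector<std::optional<Params>>& operands,
                           std::vector<std::optional<Params>>& results) {
  const auto is_set = [](const std::optional<Params>& p) {
    return p.has_value();
  };
  if (std::all_of(operands.begin(), operands.end(), is_set) &&
      std::all_of(results.begin(), results.end(), is_set)) {
    return false;  // everything already quantized: leave it be
  }

  Params chosen;
  if (!results.empty() && results.front().has_value()) {
    // Prefer a result type that downstream quantize users already fixed.
    chosen = *results.front();
  } else {
    // Otherwise merge the ranges of the operands that are quantized.
    chosen = {std::numeric_limits<double>::max(),
              std::numeric_limits<double>::lowest()};
    for (const auto& p : operands) {
      if (!p) continue;
      chosen.real_min = std::min(chosen.real_min, p->real_min);
      chosen.real_max = std::max(chosen.real_max, p->real_max);
    }
  }

  bool changed = false;
  for (auto& p : operands) {
    if (!p) { p = chosen; changed = true; }
  }
  for (auto& p : results) {
    if (!p) { p = chosen; changed = true; }
  }
  return changed;
}

int main() {
  std::vector<std::optional<Params>> operands = {Params{0.0, 6.0},
                                                 Params{-1.0, 1.0}};
  std::vector<std::optional<Params>> results = {std::nullopt};
  const bool changed = PropagateConcatParams(operands, results);
  std::printf("changed=%d result_range=[%f, %f]\n", changed,
              results[0]->real_min, results[0]->real_max);
  return 0;
}
```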
- if (qdq_conversion_mode_ != quant::QDQConversionMode::kQDQNone && + if (qdq_conversion_mode_ != QDQConversionMode::kQDQNone && !scale_spec->required_same_quantized_axes_func()) { if (HasPerAxisQuantizedOperand(op)) continue; } @@ -850,7 +1029,7 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { // If the model already contains immutable QDQs, require upstream to // explicitly fix output range instead. if (scale_spec->has_fixed_output_range && infer_tensor_range_ && - qdq_conversion_mode_ == quant::QDQConversionMode::kQDQNone) { + qdq_conversion_mode_ == QDQConversionMode::kQDQNone) { // Infer ranges from the activation ops. This is usually required for // the post-training quantization workflow. // TODO: b/323478683 - Different result can have different fixed range. @@ -864,7 +1043,7 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { } } - const std::unique_ptr spec = GetQuantSpec(op); + const std::unique_ptr spec = GetQuantSpec(op); for (const auto& [bias_operand_idx, non_bias_params] : spec->biases_params) { const auto& [non_bias_operand_indices, accumulator_scale_func] = @@ -936,28 +1115,28 @@ void QuantizationDriver::Run() { void ApplyQuantizationParamsPropagation( const func::FuncOp func, const bool is_signed, const int bit_width, const bool disable_per_channel, - const quant::OpQuantSpecGetter op_quant_spec_getter, + const OpQuantSpecGetter op_quant_spec_getter, const bool infer_tensor_ranges, const bool legacy_float_scale, - quant::QDQConversionMode qdq_conversion_mode) { + QDQConversionMode qdq_conversion_mode) { ApplyQuantizationParamsPropagation( func, is_signed, bit_width, disable_per_channel, op_quant_spec_getter, - quant::GetDefaultQuantScaleSpec, infer_tensor_ranges, legacy_float_scale, + GetDefaultQuantScaleSpec, infer_tensor_ranges, legacy_float_scale, qdq_conversion_mode); } void ApplyQuantizationParamsPropagation( const func::FuncOp func, const bool is_signed, const int bit_width, const bool disable_per_channel, - const quant::OpQuantSpecGetter op_quant_spec_getter, - const quant::OpQuantScaleSpecGetter op_quant_scale_spec_getter, + const OpQuantSpecGetter op_quant_spec_getter, + const OpQuantScaleSpecGetter op_quant_scale_spec_getter, const bool infer_tensor_ranges, const bool legacy_float_scale, - quant::QDQConversionMode qdq_conversion_mode) { + QDQConversionMode qdq_conversion_mode) { QuantizationDriver(func, is_signed, bit_width, disable_per_channel, op_quant_spec_getter, op_quant_scale_spec_getter, infer_tensor_ranges, qdq_conversion_mode, legacy_float_scale) .Run(); } - +} // namespace temp } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/tfl_quantization_driver.h b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/tfl_quantization_driver.h similarity index 87% rename from tensorflow/compiler/mlir/lite/transforms/tfl_quantization_driver.h rename to tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/tfl_quantization_driver.h index d1bc55dd718a..24c265e8ae60 100644 --- a/tensorflow/compiler/mlir/lite/transforms/tfl_quantization_driver.h +++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/tfl_quantization_driver.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TFL_QUANTIZATION_DRIVER_H_ -#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TFL_QUANTIZATION_DRIVER_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_TFL_QUANTIZATION_DRIVER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_TFL_QUANTIZATION_DRIVER_H_ #include #include @@ -34,11 +34,14 @@ limitations under the License. #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" namespace mlir { namespace TFL { +// TODO(b/413355305): Remove temp namespace after TFL's 2 quantization_drivers +// are merged. +namespace temp { // The state for each op result during the quantization parameters propagation. struct QuantState { @@ -104,14 +107,14 @@ class QuantizationDriver { // (op, result index) pair. using OpWithResultIndex = std::pair; - explicit QuantizationDriver( - func::FuncOp func_op, const bool is_signed, const int bit_width, - const bool disable_per_channel, - quant::OpQuantSpecGetter op_quant_spec_getter, - quant::OpQuantScaleSpecGetter op_quant_scale_spec_getter, - const bool infer_tensor_range, - const quant::QDQConversionMode qdq_conversion_mode, - const bool legacy_float_scale = false) + explicit QuantizationDriver(func::FuncOp func_op, const bool is_signed, + const int bit_width, + const bool disable_per_channel, + OpQuantSpecGetter op_quant_spec_getter, + OpQuantScaleSpecGetter op_quant_scale_spec_getter, + const bool infer_tensor_range, + const QDQConversionMode qdq_conversion_mode, + const bool legacy_float_scale = false) : fn_(func_op), builder_(func_op.getBody()), is_signed_(is_signed), @@ -192,8 +195,8 @@ class QuantizationDriver { bool IsWeight(Operation* cst) { return llvm::is_contained(weights_, cst); } // Returns all the related quantization constraints of the op. - std::unique_ptr GetQuantSpec(Operation* op); - std::unique_ptr GetQuantScaleSpec(Operation* op); + std::unique_ptr GetQuantSpec(Operation* op); + std::unique_ptr GetQuantScaleSpec(Operation* op); // Returns whether quantization parameters have been propagated to the results // of this op. @@ -219,7 +222,7 @@ class QuantizationDriver { // parameters are calculated by `func`. QuantizedType GetBiasParams(Operation* op, int bias_index, ArrayRef non_bias_operand_indices, - quant::AccumulatorScaleFunc func); + AccumulatorScaleFunc func); // Sets the quantization parameters of the result to `quantized_type`. If // any quantization parameters have been propagated, a requantize will @@ -344,8 +347,8 @@ class QuantizationDriver { // quantized ops for the arguments are deterministically ordered. SmallVector args_; - quant::OpQuantSpecGetter op_quant_spec_getter_; - quant::OpQuantScaleSpecGetter op_quant_scale_spec_getter_; + OpQuantSpecGetter op_quant_spec_getter_; + OpQuantScaleSpecGetter op_quant_scale_spec_getter_; // Infer output ranges for activation ops and constants. This is usually // required for post-training quantization. 
@@ -356,7 +359,7 @@ class QuantizationDriver { const bool legacy_float_scale_; // The type of qdq conversion. - const quant::QDQConversionMode qdq_conversion_mode_; + const QDQConversionMode qdq_conversion_mode_; }; // Propagates quantization parameters across ops in this function and satisfies @@ -368,19 +371,21 @@ class QuantizationDriver { // Setting `infer_tensor_range` to true, to infer quantization parameters from // the activation ops and weight constants. This is only used for post-training // quantization. -void ApplyQuantizationParamsPropagation( - func::FuncOp func, bool is_signed, int bit_width, bool disable_per_channel, - quant::OpQuantSpecGetter op_quant_spec_getter, bool infer_tensor_ranges, - bool legacy_float_scale, quant::QDQConversionMode qdq_conversion_mode); +void ApplyQuantizationParamsPropagation(func::FuncOp func, bool is_signed, + int bit_width, bool disable_per_channel, + OpQuantSpecGetter op_quant_spec_getter, + bool infer_tensor_ranges, + bool legacy_float_scale, + QDQConversionMode qdq_conversion_mode); void ApplyQuantizationParamsPropagation( func::FuncOp func, bool is_signed, int bit_width, bool disable_per_channel, - quant::OpQuantSpecGetter op_quant_spec_getter, - quant::OpQuantScaleSpecGetter op_quant_scale_spec_getter, - bool infer_tensor_ranges, bool legacy_float_scale, - quant::QDQConversionMode qdq_conversion_mode); + OpQuantSpecGetter op_quant_spec_getter, + OpQuantScaleSpecGetter op_quant_scale_spec_getter, bool infer_tensor_ranges, + bool legacy_float_scale, QDQConversionMode qdq_conversion_mode); +} // namespace temp } // namespace TFL } // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TFL_QUANTIZATION_DRIVER_H_ +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_COMMON_QUANTIZATION_LIB_TFL_QUANTIZATION_DRIVER_H_ diff --git a/tensorflow/compiler/mlir/lite/quantization/device_target.cc b/tensorflow/compiler/mlir/lite/quantization/device_target.cc index 651ee62ef8c6..c55114e62acc 100644 --- a/tensorflow/compiler/mlir/lite/quantization/device_target.cc +++ b/tensorflow/compiler/mlir/lite/quantization/device_target.cc @@ -20,7 +20,6 @@ limitations under the License. #include #include -#include "absl/types/optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" @@ -73,11 +72,11 @@ ScaleDecomposeFn DeviceTarget::GetDecomposeFn( void DeviceTarget::AppendToSignature(Type spec, KernelSpecs::Signature* signature) { - if (auto quant = spec.dyn_cast_or_null()) { + if (auto quant = llvm::dyn_cast_or_null(spec)) { signature->push_back(AnyQuantizedType::get( quant.getFlags(), quant.getStorageType(), quant.getExpressedType(), quant.getStorageTypeMin(), quant.getStorageTypeMax())); - } else if (auto any = spec.dyn_cast_or_null()) { + } else if (auto any = llvm::dyn_cast_or_null(spec)) { signature->push_back(any); } else { // float signature->push_back(AnyQuantizedType()); @@ -114,17 +113,17 @@ LogicalResult DeviceTarget::DecomposeMultiplyAccumulateScale( llvm::SmallVector input_specs, out_specs; for (auto spec : rop.getInputSpecs()) { - input_specs.push_back(spec.cast().getValue()); + input_specs.push_back(llvm::cast(spec).getValue()); } for (auto spec : rop.getOutputSpecs()) { - out_specs.push_back(spec.cast().getValue()); + out_specs.push_back(llvm::cast(spec).getValue()); } - auto in_spec = input_specs[0].dyn_cast(); + auto in_spec = llvm::dyn_cast(input_specs[0]); // TODO(fengliuai): handles the PerAxis QuantizedType. 
- auto w_spec = input_specs[1].dyn_cast(); - auto b_spec = input_specs[2].dyn_cast(); - auto o_spec = out_specs[0].dyn_cast(); + auto w_spec = llvm::dyn_cast(input_specs[1]); + auto b_spec = llvm::dyn_cast(input_specs[2]); + auto o_spec = llvm::dyn_cast(out_specs[0]); if (!in_spec || !w_spec || !b_spec || !o_spec) return failure(); double scale_product = in_spec.getScale() * w_spec.getScale(); @@ -165,10 +164,8 @@ LogicalResult DeviceTarget::DecomposeSameScale( output_multipliers->push_back(kUnitQuantizedMultiplier); } - auto o_spec = rop.getOutputSpecs()[0] - .cast() - .getValue() - .dyn_cast(); + auto o_spec = llvm::dyn_cast( + llvm::cast(rop.getOutputSpecs()[0]).getValue()); if (!o_spec) return failure(); // output ranges diff --git a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc index 9347e9633020..2a35475dcceb 100644 --- a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc +++ b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc @@ -23,6 +23,7 @@ limitations under the License. #include "llvm/ADT/APFloat.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Regex.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -106,8 +107,8 @@ class ImportQuantStatsPass if (index < 0 || index >= static_cast(op->getNumResults())) return false; Value res = op->getResult(index); - return res.getType().isa() && - res.getType().cast().getElementType().isa(); + return isa(res.getType()) && + isa(cast(res.getType()).getElementType()); } // A method to retrieve the name for the given op. @@ -235,11 +236,11 @@ std::unique_ptr> CreateImportQuantStatsPassForTFControlDialect(const std::string &stats_str) { auto get_name_func = [](Operation *op) { Location loc = tensorflow::GetLocationWithoutOpType(op->getLoc()); - if (auto name = loc.dyn_cast()) { + if (auto name = llvm::dyn_cast(loc)) { return name.getName().strref(); - } else if (auto fused_name = loc.dyn_cast()) { + } else if (auto fused_name = llvm::dyn_cast(loc)) { for (auto sub_loc : fused_name.getLocations()) { - if (auto named_sub_loc = sub_loc.dyn_cast()) { + if (auto named_sub_loc = llvm::dyn_cast(sub_loc)) { return named_sub_loc.getName().strref(); } } diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/BUILD b/tensorflow/compiler/mlir/lite/quantization/ir/BUILD index a6d6c6144454..88022e023443 100644 --- a/tensorflow/compiler/mlir/lite/quantization/ir/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/ir/BUILD @@ -26,30 +26,18 @@ td_library( gentbl_cc_library( name = "QuantOpsIncGen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-decls"], - "QuantOps.h.inc", - ), - ( - ["-gen-op-defs"], - "QuantOps.cc.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=quantfork", - ], - "QuantOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=quantfork", - ], - "QuantOpsDialect.cc.inc", - ), - ], + tbl_outs = { + "QuantOps.h.inc": ["-gen-op-decls"], + "QuantOps.cc.inc": ["-gen-op-defs"], + "QuantOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=quantfork", + ], + "QuantOpsDialect.cc.inc": [ + "-gen-dialect-defs", + "-dialect=quantfork", + ], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "QuantOps.td", deps = [":QuantizationOpsTdFiles"], @@ -58,15 +46,10 @@ gentbl_cc_library( gentbl_cc_library( name = "QuantPassIncGen", 
compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=quantfork", - ], - "Passes.h.inc", - ), - ], + tbl_outs = {"Passes.h.inc": [ + "-gen-pass-decls", + "-name=quantfork", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "Passes.td", deps = ["@llvm-project//mlir:PassBaseTdFiles"], diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/ConvertSimQuant.cc b/tensorflow/compiler/mlir/lite/quantization/ir/ConvertSimQuant.cc index ddede29c0d7e..7eefe6a38e3c 100644 --- a/tensorflow/compiler/mlir/lite/quantization/ir/ConvertSimQuant.cc +++ b/tensorflow/compiler/mlir/lite/quantization/ir/ConvertSimQuant.cc @@ -66,7 +66,9 @@ class FakeQuantRewrite : public OpRewritePattern { bool *hadFailure; bool failableRewrite(FakeQuantOp op, PatternRewriter &rewriter) const { - auto converter = ExpressedToQuantizedConverter::forInputType(op.getType()); + auto converter = + mlir::quant::ir::ExpressedToQuantizedConverter::forInputType( + op.getType()); if (!converter) { return (op.emitError("unsupported quantized type conversion"), true); } diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.cc b/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.cc index 2d79db85fadc..af0d21594ae9 100644 --- a/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.cc @@ -32,7 +32,8 @@ using namespace mlir::quantfork; /// Returns a converter Attribute or nullptr if conversion is not possible. static Attribute convertPrimitiveValueAttr( Attribute origRealValue, quant::QuantizedType quantizedElementType, - const UniformQuantizedValueConverter &converter, Type &outConvertedType) { + const mlir::quant::ir::UniformQuantizedValueConverter &converter, + Type &outConvertedType) { if (mlir::isa(origRealValue)) { FloatAttr floatAttr = mlir::cast(origRealValue); outConvertedType = quantizedElementType.getStorageType(); @@ -49,7 +50,7 @@ static Attribute convertPrimitiveValueAttr( static DenseElementsAttr convertDenseFPElementsAttr( DenseFPElementsAttr realFPElementsAttr, quant::QuantizedType quantizedElementType, - const UniformQuantizedValueConverter &converter) { + const mlir::quant::ir::UniformQuantizedValueConverter &converter) { return realFPElementsAttr.mapValues( quantizedElementType.getStorageType(), [&converter](const APFloat &realVal) { @@ -63,7 +64,7 @@ static DenseElementsAttr convertDenseFPElementsAttr( static SparseElementsAttr convertSparseElementsAttr( SparseElementsAttr realSparseAttr, quant::QuantizedType quantizedElementType, - const UniformQuantizedValueConverter &converter) { + const mlir::quant::ir::UniformQuantizedValueConverter &converter) { DenseElementsAttr realDenseAttr = realSparseAttr.getValues(); if (!mlir::isa(realDenseAttr)) { return nullptr; @@ -92,7 +93,8 @@ static SparseElementsAttr convertSparseElementsAttr( /// converter. Attribute mlir::quantfork::quantizeAttrUniform( Attribute realValue, quant::UniformQuantizedType quantizedElementType, - const UniformQuantizedValueConverter &converter, Type &outConvertedType) { + const mlir::quant::ir::UniformQuantizedValueConverter &converter, + Type &outConvertedType) { // Fork to handle different variants of constants supported. if (mlir::isa(realValue)) { // Dense tensor or vector constant. 
@@ -125,14 +127,15 @@ Attribute mlir::quantfork::quantizeAttr( Type &outConvertedType) { if (auto uniformQuantized = mlir::dyn_cast(quantizedElementType)) { - UniformQuantizedValueConverter converter(uniformQuantized); + mlir::quant::ir::UniformQuantizedValueConverter converter(uniformQuantized); return quantizeAttrUniform(realValue, uniformQuantized, converter, outConvertedType); } if (auto uniformQuantizedPerAxis = mlir::dyn_cast( quantizedElementType)) { - UniformQuantizedPerAxisValueConverter converter(uniformQuantizedPerAxis); + mlir::quant::ir::UniformQuantizedPerAxisValueConverter converter( + uniformQuantizedPerAxis); auto converted = converter.convert(realValue); // TODO: why we need this outConvertedType? remove it? if (converted) { diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.h b/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.h index bfc6afb834b0..c3770fa88cca 100644 --- a/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.h +++ b/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.h @@ -23,9 +23,11 @@ class Type; namespace quant { class QuantizedType; class UniformQuantizedType; +namespace ir { +class UniformQuantizedValueConverter; +} // namespace ir } // namespace quant namespace quantfork { -class UniformQuantizedValueConverter; /// Converts an attribute from a type based on /// quantizedElementType.getExpressedType() to one based on @@ -61,10 +63,10 @@ Attribute quantizeAttr(Attribute realValue, /// (realValue: DenseElementsAttr[tensor<2x2xf32>], /// quantizedElementType: UniformQuantizedType[i8:f32]) /// -> (DenseElementsAttr[tensor<2x2xi8>], outConvertedType: tensor<2x2xi8>) -Attribute quantizeAttrUniform(Attribute realValue, - quant::UniformQuantizedType quantizedElementType, - const UniformQuantizedValueConverter &converter, - Type &outConvertedType); +Attribute quantizeAttrUniform( + Attribute realValue, quant::UniformQuantizedType quantizedElementType, + const mlir::quant::ir::UniformQuantizedValueConverter &converter, + Type &outConvertedType); } // namespace quantfork } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD index ac7cf3ab6f45..cf423fe6d067 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD @@ -37,8 +37,8 @@ cc_library( "//tensorflow/compiler/mlir/lite:tf_tfl_passes", "//tensorflow/compiler/mlir/lite/debug", "//tensorflow/compiler/mlir/lite/debug:debug_options_proto_cc", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/lite/schema:schema_fbs", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/container:flat_hash_set", @@ -65,8 +65,8 @@ cc_library( "//tensorflow/compiler/mlir/lite:flatbuffer_translate_lib", "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite:tf_tfl_passes", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/lite/schema:schema_fbs", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/container:flat_hash_set", @@ -88,14 +88,14 @@ cc_library( ], hdrs = [ 
"tfl_to_std.h", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_utils.h", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_utils.h", ], deps = [ "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc index 22df8a4358b2..e16da5d6303e 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc @@ -38,11 +38,11 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/types.pb.h" @@ -92,7 +92,7 @@ absl::Status QuantizeModel( // Add debugging instrumentation tensorflow::InitPassManager(pm, debug_options.value()); } - quant::QuantizationSpecs quant_specs; + TFL::QuantizationSpecs quant_specs; quant_specs.inference_type = tflite::TflTypeToTfType(inference_type); quant_specs.post_training_quantization = true; quant_specs.disable_per_channel = disable_per_channel; diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.cc index ba2bc4cd72ba..58fc72be5a46 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.cc @@ -38,11 +38,11 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/types.pb.h" @@ -112,7 +112,7 @@ absl::Status QuantizeWeights( // Apply quantization passes. PassManager pm((*module)->getName(), OpPassManager::Nesting::Implicit); - quant::QuantizationSpecs quant_specs; + TFL::QuantizationSpecs quant_specs; quant_specs.inference_type = tflite::TflTypeToTfType(inference_type); quant_specs.weight_quantization = true; quant_specs.weight_only_quantization = weight_only_quantization; diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc index 339dfee21495..a8eff71edf25 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc @@ -20,9 +20,9 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/utils/utils.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" namespace mlir { namespace TFL { @@ -65,8 +65,8 @@ void ConvertMlirQuantOpsToTFLQuantOps(func::FuncOp func) { auto dcast = b.create(dq.getLoc(), dq.getResult().getType(), dq.getArg()); dq.getResult().replaceAllUsesWith(dcast); - if (auto extra_attr = op->getAttr(mlir::quant::kVolatileOpAttrName)) { - dcast->setAttr(mlir::quant::kVolatileOpAttrName, extra_attr); + if (auto extra_attr = op->getAttr(kVolatileOpAttrName)) { + dcast->setAttr(kVolatileOpAttrName, extra_attr); } dq.erase(); } else if (auto q = llvm::dyn_cast(op)) { @@ -74,8 +74,8 @@ void ConvertMlirQuantOpsToTFLQuantOps(func::FuncOp func) { auto qcast = b.create(q.getLoc(), out_type, q.getArg(), TypeAttr::get(out_type)); q.getResult().replaceAllUsesWith(qcast); - if (auto extra_attr = op->getAttr(mlir::quant::kVolatileOpAttrName)) { - qcast->setAttr(mlir::quant::kVolatileOpAttrName, extra_attr); + if (auto extra_attr = op->getAttr(kVolatileOpAttrName)) { + qcast->setAttr(kVolatileOpAttrName, extra_attr); } q.erase(); } diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/BUILD index bcb756f71088..4f36cb7e7b3d 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/BUILD @@ -15,7 +15,9 @@ cc_library( srcs = ["portable_tensor_utils.cc"], hdrs = ["portable_tensor_utils.h"], visibility = [ + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib:__pkg__", 
"//tensorflow/compiler/mlir/quantization/common/quantization_lib:__pkg__", + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib:__pkg__", ], ) @@ -100,6 +102,7 @@ cc_library( "//tensorflow/core/platform:logging", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", "@com_google_absl//absl/memory", "@com_google_absl//absl/status", "@flatbuffers//:runtime_cc", diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/quantize_weights.cc b/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/quantize_weights.cc index b2d6fe972801..655c1e4deadf 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/quantize_weights.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/quantize_weights.cc @@ -25,6 +25,7 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "absl/log/log.h" #include "absl/status/status.h" #include "flatbuffers/buffer.h" // from @flatbuffers #include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers diff --git a/tensorflow/compiler/mlir/lite/quantization/numerical_utils.cc b/tensorflow/compiler/mlir/lite/quantization/numerical_utils.cc index d00bff6ebfbe..0303972950c8 100644 --- a/tensorflow/compiler/mlir/lite/quantization/numerical_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/numerical_utils.cc @@ -22,8 +22,6 @@ limitations under the License. #include #include -#include "absl/types/optional.h" - namespace mlir { namespace quant { diff --git a/tensorflow/compiler/mlir/lite/quantization/numerical_utils_test.cc b/tensorflow/compiler/mlir/lite/quantization/numerical_utils_test.cc index 7f9b02b9f614..bdb75edf3aa1 100644 --- a/tensorflow/compiler/mlir/lite/quantization/numerical_utils_test.cc +++ b/tensorflow/compiler/mlir/lite/quantization/numerical_utils_test.cc @@ -19,7 +19,6 @@ limitations under the License. #include #include -#include "absl/types/optional.h" namespace mlir { namespace quant { diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_context.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_context.cc index 8682cba5cdc5..7979e1fc7acf 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_context.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_context.cc @@ -19,6 +19,7 @@ limitations under the License. #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -33,9 +34,9 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/quantization/device_target.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" #define DEBUG_TYPE "quantization-context" @@ -190,7 +191,7 @@ void QuantizeContext::DumpStates(quantfork::QuantizeRegionOp current_op) { // - use the single input if it is ready, or, // - use the single output if it is ready, or, // - use the first ready one in the collection. 
-QuantParams QuantizeContext::GetQuantParamsForSameScaleConstraint( +TFL::QuantParams QuantizeContext::GetQuantParamsForSameScaleConstraint( Operation *op) { // Two vector to collect Non-empty operands and results states. std::vector mutable_states, immutable_states; @@ -254,12 +255,13 @@ QuantParams QuantizeContext::GetQuantParamsForSameScaleConstraint( } LogicalResult QuantizeContext::PropagateQuantParams( - Operation *op, const QuantParams params, + Operation *op, const TFL::QuantParams params, quant::AdjacentOperations *new_items, bool *changed) { // Use the final state to set all the operands' parameters. for (int i = 0, e = op->getNumOperands(); i != e; ++i) { - auto ele = op->getOperand(i).getType().cast().getElementType(); - if (ele.isa() && SetOperandParams(op, i, params)) { + auto ele = + llvm::cast(op->getOperand(i).getType()).getElementType(); + if (isa(ele) && SetOperandParams(op, i, params)) { *changed |= true; new_items->push_back(op->getOperand(i).getDefiningOp()); } @@ -267,8 +269,9 @@ LogicalResult QuantizeContext::PropagateQuantParams( // Use the final state to set all the results' parameters. for (int res = 0, e = op->getNumResults(); res != e; ++res) { - auto ele = op->getResult(res).getType().cast().getElementType(); - if (ele.isa() && SetResultParams(op, res, params)) { + auto ele = + llvm::cast(op->getResult(res).getType()).getElementType(); + if (isa(ele) && SetResultParams(op, res, params)) { auto users = op->getResult(res).getUsers(); *changed |= !users.empty(); new_items->append(users.begin(), users.end()); @@ -285,8 +288,8 @@ int QuantizeContext::StatesManager::InitializeState( } else { params_attr = op.getInputSpecs()[index]; } - QuantParams params = - params_attr.cast().getValue().dyn_cast(); + TFL::QuantParams params = + dyn_cast(cast(params_attr).getValue()); bool immutable = !EmptyParams(params); int next_state_index = states_.size(); states_.push_back({params, immutable}); @@ -329,7 +332,7 @@ bool QuantizeContext::StatesManager::SetConstantResultParams(Operation *op) { bool QuantizeContext::StatesManager::SetResultParams(Operation *op, int res_index, - QuantParams params) { + TFL::QuantParams params) { auto &state = GetResultQuantState(op, res_index); if (state.params == params) { return false; @@ -345,7 +348,7 @@ bool QuantizeContext::StatesManager::SetResultParams(Operation *op, } bool QuantizeContext::StatesManager::SetOperandParams(Operation *op, int index, - QuantParams params) { + TFL::QuantParams params) { auto &state = GetOperandQuantState(op, index); if (state.params == params) { return false; diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_context.h b/tensorflow/compiler/mlir/lite/quantization/quantization_context.h index 2b33e1e65b58..960fe465804c 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_context.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_context.h @@ -28,19 +28,21 @@ limitations under the License. 
#include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/quantization/device_target.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" namespace mlir { namespace quant { -static bool EmptyParams(QuantParams p) { return p == quant::QuantizedType(); } +static bool EmptyParams(TFL::QuantParams p) { + return p == quant::QuantizedType(); +} // The state for each op result during the quantization parameters propagation. struct QuantState { // Quantization parameters propagated to an op result. - QuantParams params; + TFL::QuantParams params; // A flag indicates this state (the params) shouldn't be changed after it is // initialized. This flag will be set to true if the quantization parameters // are from the quantization-aware training. @@ -63,7 +65,7 @@ struct RequantizeState { } pos = NO_REQUANTIZE; // Quantization parameters will be used to add the requantize ops. - QuantParams params; + TFL::QuantParams params; }; // This class manages all the intermediate quantization states. @@ -91,24 +93,24 @@ class QuantizeContext { // Update the quantization parameter for certain result of the op. By this // method, the quantization parameter is propagated to all the users of the // result as well. - bool SetResultParams(Operation *op, int index, QuantParams params) { + bool SetResultParams(Operation *op, int index, TFL::QuantParams params) { return states_manager_.SetResultParams(op, index, params); } // Update the quantization parameter for certain operand of the op. By this // method, the quantization parameter is propagated to the defining op of // operand as well. - bool SetOperandParams(Operation *op, int index, QuantParams params) { + bool SetOperandParams(Operation *op, int index, TFL::QuantParams params) { return states_manager_.SetOperandParams(op, index, params); } // Return the quantization parameter of certain result of the op. - QuantParams GetResultParams(Operation *op, int index) { + TFL::QuantParams GetResultParams(Operation *op, int index) { return states_manager_.GetResultParams(op, index); } // Return the quantization parameter of certain operand of the op. - QuantParams GetOperandParams(Operation *op, int index) { + TFL::QuantParams GetOperandParams(Operation *op, int index) { return states_manager_.GetOperandParams(op, index); } @@ -124,13 +126,13 @@ class QuantizeContext { // - use the single input if it is ready, or, // - use the single output if it is ready, or, // - use the first ready one in the collection. - QuantParams GetQuantParamsForSameScaleConstraint(Operation *op); + TFL::QuantParams GetQuantParamsForSameScaleConstraint(Operation *op); // Propagate `params` to all the quantizable port of the `op`. The adjacent // ops, which have the parameters propagated to, are collected by `new_items`, // so they can be added to the working queue. `changed` is set to true if // there are any new elements being added to `new_items`. 
- LogicalResult PropagateQuantParams(Operation *op, QuantParams params, + LogicalResult PropagateQuantParams(Operation *op, TFL::QuantParams params, AdjacentOperations *new_items, bool *changed); @@ -149,7 +151,7 @@ class QuantizeContext { // // Returns true, if the users of the result needs to be added to the // worklist. - bool SetResultParams(Operation *op, int index, QuantParams params); + bool SetResultParams(Operation *op, int index, TFL::QuantParams params); // Sets the quantization parameters of the operand to a fixed value. If any // quantization parameters have been propagated, a `requantize` will happen @@ -157,15 +159,15 @@ class QuantizeContext { // // Returns true, if the defining op of the operand needs to be added to the // worklist. - bool SetOperandParams(Operation *op, int index, QuantParams params); + bool SetOperandParams(Operation *op, int index, TFL::QuantParams params); // Returns the quantization parameters of the index-th result of the op. - QuantParams GetResultParams(Operation *op, int index) { + TFL::QuantParams GetResultParams(Operation *op, int index) { return states_[result_states_[{op, index}]].params; } // Returns the quantization parameters of the index-th operand of the op. - QuantParams GetOperandParams(Operation *op, int index) { + TFL::QuantParams GetOperandParams(Operation *op, int index) { return states_[operand_states_[{op, index}]].params; } diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD index 7d2ff18de0ab..2ce14328fb1a 100644 --- a/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD @@ -15,13 +15,13 @@ cc_library( "//tensorflow/cc/saved_model:constants", "//tensorflow/cc/saved_model:loader", "//tensorflow/compiler/mlir/lite:tensorflow_lite_tf_unfreeze_global_tensors", - "//tensorflow/compiler/mlir/lite/stablehlo:tf_stablehlo", "//tensorflow/compiler/mlir/quantization/stablehlo:passes", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:config", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:static_range_ptq", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:weight_only_ptq", "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", + "//tensorflow/compiler/mlir/stablehlo:tf_stablehlo", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_freeze_variables", "//tensorflow/core/protobuf:for_core_protos_cc", diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc index ff68df33d747..3f5bcc10eedd 100644 --- a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc +++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc @@ -29,13 +29,13 @@ limitations under the License. 
#include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/cc/saved_model/constants.h" #include "tensorflow/cc/saved_model/loader.h" -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/tf_stablehlo_pass.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/BUILD b/tensorflow/compiler/mlir/lite/quantization/tensorflow/BUILD index 8a73407338f6..aee0c6574ec1 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/BUILD @@ -36,12 +36,7 @@ td_library( gentbl_cc_library( name = "ptq_fallback_to_flex_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "fallback_to_flex_patterns.inc", - ), - ], + tbl_outs = {"fallback_to_flex_patterns.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "fallback_to_flex_patterns.td", deps = [":ptq_td_files"], @@ -60,8 +55,8 @@ cc_library( deps = [ ":ptq_fallback_to_flex_inc_gen", "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:translate_utils", "//tensorflow/core:protos_all_cc", diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc index a60ac436b56b..6c43167a78cb 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc @@ -31,8 +31,8 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" namespace mlir { @@ -141,7 +141,7 @@ struct InsertQuantOpsAfterTFFakeQuantOp IntegerAttr num_bits = rewriter.getI64IntegerAttr(tf_op.getNumBits()); BoolAttr narrow_range = rewriter.getBoolAttr(tf_op.getNarrowRange()); Type res_type = tf_op.getType(); - TypeAttr qtype = quant::GetQuantizedTypeAttr( + TypeAttr qtype = TFL::GetQuantizedTypeAttr( rewriter, res_type, min_value, max_value, quant_dim, num_bits, narrow_range, /*is_signed=*/true); if (!qtype) return failure(); diff --git a/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc b/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc index 874118ae4f93..94660ab67b02 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc @@ -53,7 +53,7 @@ static bool OpQuantSpecWriter(raw_ostream &os, const RecordKeeper &records) { std::vector defs = records.getAllDerivedDefinitions("Op"); llvm::sort(defs, LessRecord()); - OUT(0) << "static std::unique_ptr " + OUT(0) << "static std::unique_ptr " "GetOpQuantSpec(mlir::Operation *op, bool " "disable_per_channel_for_dense_layers = false) {\n"; // TODO(b/176258587): Move to OpTrait if this should be generalized. @@ -66,15 +66,14 @@ static bool OpQuantSpecWriter(raw_ostream &os, const RecordKeeper &records) { "GetLstmOpQuantSpec(lstm_op);\n"; OUT(2) << "}\n"; - OUT(2) << "auto spec = std::make_unique();\n"; + OUT(2) << "auto spec = std::make_unique();\n"; llvm::SmallVector matches; for (auto *def : defs) { Operator op(def); for (const auto t : op.getTraits()) { if (auto opTrait = llvm::dyn_cast(&t)) { auto trait_str = opTrait->getFullyQualifiedTraitName(); - if (!llvm::StringRef{trait_str}.consume_front( - "::mlir::OpTrait::quant::")) + if (!llvm::StringRef{trait_str}.consume_front("::mlir::OpTrait::TFL::")) continue; OUT(2) << "if (auto tfl = llvm::dyn_cast<" << op.getQualCppClassName() @@ -84,7 +83,7 @@ static bool OpQuantSpecWriter(raw_ostream &os, const RecordKeeper &records) { OUT(4) << "for (int i = 0, e = op->getNumResults(); i != e; ++i)\n"; OUT(6) << "spec->restricted_output_params[std::make_pair(" << matches[1] << ", " << matches[2] - << ")].push_back(tfl.::mlir::OpTrait::quant::" << trait_str + << ")].push_back(tfl.::mlir::OpTrait::TFL::" << trait_str << "<" << op.getQualCppClassName() << ">::GetResultQuantizedType(i));\n"; matches.clear(); @@ -93,7 +92,7 @@ static bool OpQuantSpecWriter(raw_ostream &os, const RecordKeeper &records) { if (acc_uniform_trait_regex.match(trait_str, &matches)) { OUT(4) << "spec->biases_params.emplace(std::make_pair(" << matches[1] << ", std::make_pair(tfl.GetAllNonBiasOperands()," - << "quant::GetUniformQuantizedTypeForBias)));\n"; + << "GetUniformQuantizedTypeForBias)));\n"; matches.clear(); } // There is a "QuantChannelDim" trait, set the quantization dimension. 
diff --git a/tensorflow/compiler/mlir/lite/quantization/tools/tflite_op_coverage_spec_getters_gen.cc b/tensorflow/compiler/mlir/lite/quantization/tools/tflite_op_coverage_spec_getters_gen.cc index c92d43da951a..410730604ee0 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tools/tflite_op_coverage_spec_getters_gen.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tools/tflite_op_coverage_spec_getters_gen.cc @@ -181,7 +181,7 @@ void GenerateStaticQuantOp(std::vector &defs, for (const auto *def : defs) { Operator op(def); - if (!op.getTrait("::mlir::OpTrait::quant::QuantizableResult")) continue; + if (!op.getTrait("::mlir::OpTrait::TFL::QuantizableResult")) continue; const llvm::DagInit *args_in_dag = def->getValueAsDag("arguments"); // Assumes argument name is "input" for input activations. Otherwise, assume diff --git a/tensorflow/compiler/mlir/lite/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/BUILD index cfb0925b7fe6..4d7e4c1af5cb 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/BUILD @@ -1,6 +1,5 @@ load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library") -load("@local_xla//xla/tsl/platform:build_config_root.bzl", "if_static") -load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test") +load("//tensorflow:tensorflow.bzl", "tf_cc_binary") load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") @@ -51,29 +50,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "rename_entrypoint_to_main", - srcs = [ - "transforms/rename_entrypoint_to_main.cc", - ], - hdrs = [ - "transforms/rename_entrypoint_to_main.h", - ], - copts = [ - "-Ithird_party", - ], - deps = [ - ":stablehlo_util", - "//tensorflow/compiler/mlir/tensorflow", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", - ], - alwayslink = 1, -) - cc_library( name = "hlo_matchers", srcs = [ @@ -120,128 +96,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "legalize_utils", - srcs = ["transforms/utils.cc"], - hdrs = ["transforms/utils.h"], - deps = [ - "@llvm-project//llvm:Support", - "@llvm-project//mlir:IR", - "@local_xla//xla/mlir_hlo", - ], -) - -tf_cc_test( - name = "legalize_utils_test", - srcs = ["transforms/utils_test.cc"], - deps = [ - ":legalize_utils", - "@com_google_googletest//:gtest_main", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Support", - "@local_xla//xla/mlir_hlo", - ], -) - -gentbl_cc_library( - name = "legalize_tf_patterns_inc_gen", - compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_legalize_tf.inc", - ), - ], - tblgen = "@llvm-project//mlir:mlir-tblgen", - td_file = "transforms/legalize_tf_patterns.td", - deps = [ - "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncTdFiles", - "@llvm-project//mlir:TensorOpsTdFiles", - "@local_xla//xla/mlir_hlo:hlo_ops_td_files", - ], -) - -cc_library( - name = "legalize_tf", - srcs = [ - "transforms/generated_legalize_tf.inc", - "transforms/legalize_tf.cc", - ], - hdrs = [ - "transforms/legalize_tf_passes.h", - ], - deps = [ - ":legalize_tf_patterns_inc_gen", - ":legalize_utils", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", - "//tensorflow/compiler/mlir/tensorflow:xla_sharding_util", 
- "//tensorflow/core:framework", - "//tensorflow/core/kernels:conv_grad_shape_utils", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:Dialect", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:MemRefDialect", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:ShapeDialect", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:TransformUtils", - "@local_tsl//tsl/platform:bfloat16", - "@local_tsl//tsl/platform:tensor_float_32_hdr_lib", - "@local_xla//xla:xla_data_proto_cc", - "@local_xla//xla/client:padding", - "@local_xla//xla/client:sharding_builder", - "@local_xla//xla/client/lib:conv_grad_size_util", - "@local_xla//xla/hlo/translate/hlo_to_mhlo:attribute_importer", - "@local_xla//xla/mlir_hlo", - "@local_xla//xla/mlir_hlo:convert_op_folder", - "@local_xla//xla/translate/hlo_to_mhlo:attribute_importer", - "@local_xla//xla/tsl/platform:status", - "@stablehlo//:chlo_ops", - ] + if_static(["@local_tsl//tsl/platform:tensor_float_32_utils"]), -) - -cc_library( - name = "tf_stablehlo", - srcs = [ - "transforms/tf_stablehlo_pass.cc", - ], - hdrs = [ - "transforms/tf_stablehlo_pass.h", - ], - copts = [ - "-Ithird_party", - ], - deps = [ - ":legalize_tf", - ":stablehlo_util", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow/transforms:lower_tf_lib", - "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf_with_tf2xla", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:ShapeDialect", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:TransformUtils", - "@llvm-project//mlir:Transforms", - "@local_xla//xla/mlir_hlo", - "@local_xla//xla/mlir_hlo:hlo_dialect_registration", - "@local_xla//xla/mlir_hlo:mhlo_passes", - "@local_xla//xla/mlir_hlo:type_conversion", - "@stablehlo//:chlo_ops", - "@stablehlo//:register", - ], - alwayslink = 1, -) - cc_library( name = "tfl_stablehlo", srcs = [ @@ -281,19 +135,19 @@ cc_library( ], deps = [ ":drop_savedmodel_semantics", - ":fuse_convolution_pass", ":legalize_stablehlo_custom_call_to_composite", ":legalize_tf_xla_call_module_to_stablehlo_pass", ":optimize", - ":rename_entrypoint_to_main", ":smuggle_disallowed_ops", ":stablehlo_fuse_convolution_pass", ":stablehlo_unfuse_batch_norm_pass", - ":tf_stablehlo", ":unfold_splat_constant_pass", - ":unfuse_batch_norm_pass", "//tensorflow/compiler/mlir/quantization/stablehlo:bridge_passes", "//tensorflow/compiler/mlir/stablehlo:fold_broadcast_pass", + "//tensorflow/compiler/mlir/stablehlo:fuse_convolution_pass", + "//tensorflow/compiler/mlir/stablehlo:rename_entrypoint_to_main", + "//tensorflow/compiler/mlir/stablehlo:tf_stablehlo", + "//tensorflow/compiler/mlir/stablehlo:unfuse_batch_norm_pass", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes", "//tensorflow/compiler/mlir/tf2xla/transforms:tf_xla_passes", @@ -336,7 +190,7 @@ cc_library( deps = [ ":stablehlo_util", "//tensorflow/compiler/mlir/lite:tensorflow_lite", - "//tensorflow/compiler/mlir/quantization/common/quantization_lib", + "//tensorflow/compiler/mlir/lite/quantization/common/quantization_lib", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/log", "@com_google_absl//absl/strings", @@ -353,15 +207,10 @@ cc_library( gentbl_cc_library( 
name = "passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=OdmlStablehlo", - ], - "transforms/stablehlo_passes.h.inc", - ), - ], + tbl_outs = {"transforms/stablehlo_passes.h.inc": [ + "-gen-pass-decls", + "-name=OdmlStablehlo", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/stablehlo_passes.td", deps = ["@llvm-project//mlir:PassBaseTdFiles"], @@ -389,33 +238,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "unfuse_batch_norm_pass", - srcs = [ - "transforms/mhlo_passes/unfuse_batch_norm_pass.cc", - ], - hdrs = [ - "transforms/stablehlo_passes.h", - ], - copts = [ - "-Ithird_party", - ], - deps = [ - ":passes_inc_gen", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:ShapeDialect", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TransformUtils", - "@local_xla//xla/mlir_hlo", - ], - alwayslink = 1, -) - cc_library( name = "stablehlo_unfuse_batch_norm_pass", srcs = [ @@ -442,35 +264,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "fuse_convolution_pass", - srcs = [ - "transforms/mhlo_passes/fuse_convolution_pass.cc", - ], - hdrs = [ - "transforms/stablehlo_passes.h", - ], - copts = [ - "-Ithird_party", - ], - deps = [ - ":passes_inc_gen", - "//tensorflow/compiler/mlir/lite:validators", - "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:Dialect", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:ShapeDialect", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TransformUtils", - "@llvm-project//mlir:Transforms", - "@local_xla//xla/mlir_hlo", - ], - alwayslink = 1, -) - cc_library( name = "stablehlo_fuse_convolution_pass", srcs = [ @@ -705,12 +498,7 @@ cc_library( gentbl_cc_library( name = "hlo_legalize_tf_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_legalize_hlo.inc", - ), - ], + tbl_outs = {"transforms/generated_legalize_hlo.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/legalize_hlo_patterns.td", deps = [ @@ -724,12 +512,7 @@ gentbl_cc_library( gentbl_cc_library( name = "hlo_legalize_tflite_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_tflite_legalize_hlo.inc", - ), - ], + tbl_outs = {"transforms/generated_tflite_legalize_hlo.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/tflite_legalize_hlo_patterns.td", deps = [ @@ -787,12 +570,7 @@ cc_library( gentbl_cc_library( name = "prepare_hlo_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_prepare_hlo.inc", - ), - ], + tbl_outs = {"transforms/generated_prepare_hlo.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/prepare_hlo.td", deps = [ @@ -885,6 +663,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow/transforms:tf_device_pass_inc_gen", "//tensorflow/core:framework", "//tensorflow/core:lib", + "@com_google_absl//absl/status", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", 
"@llvm-project//mlir:ArithDialect", @@ -1038,12 +817,7 @@ cc_library( gentbl_cc_library( name = "composite_lowering_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_composite_lowering.inc", - ), - ], + tbl_outs = {"transforms/generated_composite_lowering.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/composite_lowering_patterns.td", deps = [ @@ -1067,7 +841,6 @@ tf_cc_binary( " [tf.lite.OpsSet.EXPERIMENTAL_STABLEHLO_OPS]", deps = [ ":check_accepted_ops_pass", - ":legalize_tf", ":op_stat_pass", ":stablehlo_util", ":transforms", @@ -1079,6 +852,7 @@ tf_cc_binary( "//tensorflow/compiler/mlir/lite:tf_to_tfl_flatbuffer", "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_preprocess", "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quant_ops", + "//tensorflow/compiler/mlir/stablehlo:legalize_tf", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", @@ -1114,7 +888,6 @@ tf_cc_binary( tags = ["hostonly"], deps = [ ":compose_uniform_quantized_type_pass", - ":fuse_convolution_pass", ":legalize_stablehlo_composite_to_tfl_custom", ":legalize_stablehlo_custom_call_to_composite", ":legalize_stablehlo_to_vhlo_pass", @@ -1125,15 +898,16 @@ tf_cc_binary( ":stablehlo_fuse_convolution_pass", ":stablehlo_unfuse_batch_norm_pass", ":tf_legalize_hlo", - ":tf_stablehlo", ":tfl_legalize_chlo", ":tfl_legalize_hlo", ":tfl_stablehlo", - ":unfuse_batch_norm_pass", ":uniform_quantized_stablehlo_to_tfl_pass", "//tensorflow/compiler/mlir:passes", "//tensorflow/compiler/mlir:tf_mlir_opt_main", "//tensorflow/compiler/mlir/stablehlo:fold_broadcast_pass", + "//tensorflow/compiler/mlir/stablehlo:fuse_convolution_pass", + "//tensorflow/compiler/mlir/stablehlo:tf_stablehlo", + "//tensorflow/compiler/mlir/stablehlo:unfuse_batch_norm_pass", ], ) diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/BUILD index c54545bd3313..d6b46ee3d31a 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/BUILD @@ -64,12 +64,7 @@ cc_library( gentbl_cc_library( name = "shlo_simplify_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "transforms/generated_shlo_simplify.inc", - ), - ], + tbl_outs = {"transforms/generated_shlo_simplify.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/shlo_simplify.td", deps = ["@stablehlo//:stablehlo_ops_td_files"], @@ -91,15 +86,10 @@ cc_library( gentbl_cc_library( name = "passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=ODMLConverter", - ], - "passes.h.inc", - ), - ], + tbl_outs = {"passes.h.inc": [ + "-gen-pass-decls", + "-name=ODMLConverter", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes.td", deps = ["@llvm-project//mlir:PassBaseTdFiles"], diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.cc b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.cc index cb48050db47c..778e76c79c98 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.cc @@ -104,7 +104,7 @@ static LogicalResult 
FoldDivOpInternal(stablehlo::DivOp op, } auto res_attr = DenseElementsAttr::get( - const_oprs[0].getType().cast(), res); + mlir::cast(const_oprs[0].getType()), res); rewriter.replaceOpWithNewOp(adaptor.value().Op(), res_attr); return success(); @@ -112,10 +112,10 @@ static LogicalResult FoldDivOpInternal(stablehlo::DivOp op, static LogicalResult FoldDivOp(stablehlo::DivOp op, PatternRewriter& rewriter) { auto etype = op.getType().getElementType(); - if (etype.isa()) { + if (mlir::isa(etype)) { return FoldDivOpInternal(op, rewriter); } - if (etype.isa()) { + if (mlir::isa(etype)) { return FoldDivOpInternal(op, rewriter); } return failure(); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/shlo_simplify.td b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/shlo_simplify.td index c8d19baeb11d..620fd42ec054 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/shlo_simplify.td +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/shlo_simplify.td @@ -19,10 +19,10 @@ include "mlir/IR/CommonAttrConstraints.td" include "mlir/IR/CommonTypeConstraints.td" def CloneF32ElementsAttrWithOnes - : NativeCodeCall<"DenseElementsAttr::get($0.getType().cast(), (float)1.0)">; + : NativeCodeCall<"DenseElementsAttr::get(llvm::cast($0.getType()), (float)1.0)">; def NotConstant : Constraint< - CPred<"$0.isa() || !llvm::isa($0.getDefiningOp())">, + CPred<"llvm::isa($0) || !llvm::isa($0.getDefiningOp())">, "Is not a constant.">; def : Pat<(StableHLO_DivOp $l, diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc index fab718c7a444..5f5942dcb714 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc @@ -56,13 +56,13 @@ limitations under the License. 
#include "tensorflow/cc/saved_model/loader.h" #include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/check_accepted_ops_pass.h" -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_passes.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_util.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.h" #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_passes.h" #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.h" diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir index 60f94c690146..f9c8c4953fb9 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir @@ -427,6 +427,21 @@ func.func private @XlaCallModule_odml.embedding_lookup.impl_0(%arg0: tensor<1xi3 // CHECK: return %[[VAL_1]] : tensor<1x2048xf32> // CHECK: } +func.func @embedding_lookup_dynamic(%arg0: tensor<1xi32>, %arg1: tensor<32000x2048xf32>, %arg2: tensor) -> tensor<1x2048xf32> { + %0 = mhlo.composite "odml.embedding_lookup" %arg2, %arg0, %arg1 {decomposition = @XlaCallModule_odml.embedding_lookup.impl_1} : (tensor, tensor<1xi32>, tensor<32000x2048xf32>) -> tensor<1x2048xf32> + return %0 : tensor<1x2048xf32> +} +func.func private @XlaCallModule_odml.embedding_lookup.impl_1(%arg2: tensor, %arg0: tensor<1xi32>, %arg1: tensor<32000x2048xf32>) -> tensor<1x2048xf32> { + %0 = "mhlo.gather"(%arg1, %arg0) <{dimension_numbers = #mhlo.gather, slice_sizes = dense<[1, 2048]> : tensor<2xi64>}> : (tensor<32000x2048xf32>, tensor<1xi32>) -> tensor<1x2048xf32> + return %0 : tensor<1x2048xf32> + } + +// CHECK-LABEL: func.func @embedding_lookup_dynamic( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1xi32>, %[[ARG_1:.*]]: tensor<32000x2048xf32>, %[[ARG_2:.*]]: tensor) -> tensor<1x2048xf32> { +// CHECK: %[[VAL_1:.*]] = "tfl.embedding_lookup"(%[[ARG_0]], %[[ARG_1]]) : (tensor<1xi32>, tensor<32000x2048xf32>) -> tensor<1x2048xf32> +// CHECK: return %[[VAL_1]] : tensor<1x2048xf32> +// CHECK: } + func.func @random_uniform(%arg0: tensor<3xi32>) -> tensor<1x2x3xf32> { %0 = mhlo.composite "odml.random_uniform" %arg0 {composite_attributes = {seed = 0 : i64, seed2 = 1: i64}, decomposition = @XlaCallModule_odml.random_uniform.impl_0} : (tensor<3xi32>) -> tensor<1x2x3xf32> @@ -451,4 +466,14 @@ func.func private @XlaCallModule_odml.random_standard_normal.impl_0(%arg0: tenso } // CHECK-LABEL func.func @random_standard_normal // CHECK: %0 = "tfl.random_standard_normal"(%arg0) <{seed = 0 : i64, seed2 = 1 : i64}> : (tensor<3xi32>) -> tensor<1x2x3xf32> -// CHECK: return %0 : tensor<1x2x3xf32> \ No newline at end of file +// CHECK: return %0 : tensor<1x2x3xf32> + + +func.func private @XlaCallModule_tfl.unpack.impl_0(%arg0: tensor<1x3x4x1xf32>) -> (tensor<1x4x1xf32>, tensor<1x4x1xf32>, tensor<1x4x1xf32>) +func.func @jax_unstack(%arg0: tensor<1x3x4x1xf32>) -> (tensor<1x4x1xf32>, 
tensor<1x4x1xf32>, tensor<1x4x1xf32>) { + %0:3 = mhlo.composite "tfl.unpack" %arg0 {composite_attributes = {num = 3 : i32, axis = 1 : i32}, decomposition = @XlaCallModule_tfl.unpack.impl_0} : (tensor<1x3x4x1xf32>) -> (tensor<1x4x1xf32>, tensor<1x4x1xf32>, tensor<1x4x1xf32>) + return %0#0, %0#1, %0#2 : tensor<1x4x1xf32>, tensor<1x4x1xf32>, tensor<1x4x1xf32> +} + +// CHECK-LABEL: jax_unstack +// CHECK: %0:3 = "tfl.unpack"(%arg0) <{axis = 1 : i32, num = 3 : i32}> : (tensor<1x3x4x1xf32>) -> (tensor<1x4x1xf32>, tensor<1x4x1xf32>, tensor<1x4x1xf32>) diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir index dc15507fc312..23edb0e03de0 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir @@ -1,31 +1,22 @@ // RUN: odml-to-stablehlo-opt %s -stablehlo-composite-legalize-tfl-custom | FileCheck %s -module { +func.func private @odml.update_kv_cache.impl_0(%arg0: tensor<1x500x4x4xf32>, %arg1: tensor<1x500x4x4xf32>, %arg2: tensor<100xi64>, %arg3: tensor<1x100x4x4xf32>, %arg4: tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) +// CHECK-LABEL: func.func private @test_multiple_kv_caches +func.func private @test_multiple_kv_caches(%arg0: tensor<1x500x4x4xf32>, %arg1: tensor<1x500x4x4xf32>, %arg2: tensor<100xi64>, %arg3: tensor<1x100x4x4xf32>, %arg4: tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) { + // CHECK: %0:2 = "tfl.custom"(%arg2, %arg3, %arg4) <{custom_code = "odml.update_kv_cache", custom_option = #tfl}> : (tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) + // CHECK: %1:2 = "tfl.custom"(%arg2, %arg3, %arg4) <{custom_code = "odml.update_kv_cache", custom_option = #tfl}> : (tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) + %0:2 = stablehlo.composite "odml.update_kv_cache" %arg0, %arg1, %arg2, %arg3, %arg4 {composite_attributes = {kv_cache_max = 500 : i64}, decomposition = @odml.update_kv_cache.impl_0} : (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) + %1:2 = stablehlo.composite "odml.update_kv_cache" %0#0, %0#1, %arg2, %arg3, %arg4 {composite_attributes = {kv_cache_max = 500 : i64}, decomposition = @odml.update_kv_cache.impl_0} : (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) + return %1#0, %1#1 : tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32> +} - // CHECK-LABEL: func.func private @test_multiple_kv_caches - func.func private @test_multiple_kv_caches(%arg0: tensor<1x500x4x4xf32>, %arg1: tensor<1x500x4x4xf32>, %arg2: tensor<100xi64>, %arg3: tensor<1x100x4x4xf32>, %arg4: tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) { - // CHECK: %0:2 = "tfl.custom"(%arg2, %arg3, %arg4) <{custom_code = "odml.update_kv_cache", custom_option = #tfl}> : (tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) - // CHECK: %1:2 = "tfl.custom"(%arg2, %arg3, %arg4) <{custom_code = "odml.update_kv_cache", custom_option = #tfl}> : (tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> 
(tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) - %0:2 = stablehlo.composite "odml.update_kv_cache" %arg0, %arg1, %arg2, %arg3, %arg4 {composite_attributes = {kv_cache_max = 500 : i64}, decomposition = @odml.update_kv_cache.impl_0} : (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) - %1:2 = stablehlo.composite "odml.update_kv_cache" %0#0, %0#1, %arg2, %arg3, %arg4 {composite_attributes = {kv_cache_max = 500 : i64}, decomposition = @odml.update_kv_cache.impl_0} : (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) - return %1#0, %1#1 : tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32> - } - func.func private @odml.update_kv_cache.impl_0(%arg0: tensor<1x500x4x4xf32>, %arg1: tensor<1x500x4x4xf32>, %arg2: tensor<100xi64>, %arg3: tensor<1x100x4x4xf32>, %arg4: tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) { - %0 = stablehlo.constant dense<500> : tensor<100xi64> - %1 = stablehlo.constant dense<0> : tensor<100xi64> - %2 = stablehlo.compare LT, %arg2, %1 : (tensor<100xi64>, tensor<100xi64>) -> tensor<100xi1> - %3 = stablehlo.add %arg2, %0 : tensor<100xi64> - %4 = stablehlo.select %2, %3, %arg2 : tensor<100xi1>, tensor<100xi64> - %5 = stablehlo.reshape %4 : (tensor<100xi64>) -> tensor<100x1xi64> - %6 = "stablehlo.scatter"(%arg0, %5, %arg3) ({ - ^bb0(%arg5: tensor, %arg6: tensor): - stablehlo.return %arg6 : tensor - }) {indices_are_sorted = false, scatter_dimension_numbers = #stablehlo.scatter, unique_indices = false} : (tensor<1x500x4x4xf32>, tensor<100x1xi64>, tensor<1x100x4x4xf32>) -> tensor<1x500x4x4xf32> - %7 = "stablehlo.scatter"(%arg1, %5, %arg4) ({ - ^bb0(%arg5: tensor, %arg6: tensor): - stablehlo.return %arg6 : tensor - }) {indices_are_sorted = false, scatter_dimension_numbers = #stablehlo.scatter, unique_indices = false} : (tensor<1x500x4x4xf32>, tensor<100x1xi64>, tensor<1x100x4x4xf32>) -> tensor<1x500x4x4xf32> - return %6, %7 : tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32> - } +// --- -} +func.func private @test_odml_detector.detector.impl_0(%arg0: tensor<2xf32>) -> tensor<2xf32> +// CHECK-LABEL: func.func private @test_odml_detector +func.func @test_odml_detector(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>) { + %0 = tfl.add %arg0, %arg1 {fused_activation_function = "NONE"} : tensor<2xf32> + // CHECK %1 = "tfl.custom"(%0) <{custom_code = "odml.detector", custom_option = #tfl}> : (tensor<2xf32>) -> tensor<2xf32> + %1 = stablehlo.composite "odml.detector" %0 {composite_attributes = {name = "out", working_dir = "/tmp/tst"}, decomposition = @test_odml_detector.detector.impl_0} : (tensor<2xf32>) -> tensor<2xf32> + return %1 : tensor<2xf32> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir index c55a93fb8f6d..8753c6fc4be1 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir @@ -3753,11 +3753,11 @@ func.func @convert_gather_offset(%arg0: tensor<1x20xi32>, %arg1: tensor<1x1xi32> // CHECK-LABEL: func @convert_gather_batching_dims( // CHECK-SAME: %[[ARG_0:.*]]: tensor<2x3x128xf32>, -// CHECK-SAME: %[[ARG_1:.*]]: tensor<3x2x128x1xi32>) +// CHECK-SAME: %[[ARG_1:.*]]: tensor<3x128x2x1xi32>) // CHECK-DAG: %[[CST:.*]] = arith.constant 
dense<[6, 128]> : tensor<2xi64> // CHECK: %[[VAL_0:.*]] = "tf.Reshape"(%[[ARG_0]], %[[CST]]) : (tensor<2x3x128xf32>, tensor<2xi64>) -> tensor<6x128xf32> -// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<[1, 0, 2, 3]> : tensor<4xi64>}> : () -> tensor<4xi64> -// CHECK: %[[VAL_1:.*]] = "tf.Transpose"(%[[ARG_1]], %[[CST_0]]) : (tensor<3x2x128x1xi32>, tensor<4xi64>) -> tensor<2x3x128x1xi32> +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<[2, 0, 1, 3]> : tensor<4xi64>}> : () -> tensor<4xi64> +// CHECK: %[[VAL_1:.*]] = "tf.Transpose"(%[[ARG_1]], %[[CST_0]]) : (tensor<3x128x2x1xi32>, tensor<4xi64>) -> tensor<2x3x128x1xi32> // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<[6, 128, 1]> : tensor<3xi64> // CHECK: %[[VAL_2:.*]] = "tf.Reshape"(%[[VAL_1]], %[[CST_1]]) : (tensor<2x3x128x1xi32>, tensor<3xi64>) -> tensor<6x128x1xi32> // CHECK-DAG: %[[CST_2:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor @@ -3773,23 +3773,23 @@ func.func @convert_gather_offset(%arg0: tensor<1x20xi32>, %arg1: tensor<1x1xi32> // CHECK: %[[VAL_7:.*]] = "tf.GatherNd"(%[[VAL_0]], %[[VAL_6]]) <{bad_indices_policy = ""}> : {{.*}} -> tensor<6x128xf32> // CHECK-DAG: %[[CST_8:.*]] = arith.constant dense<[2, 3, 128]> : tensor<3xi64> // CHECK: %[[VAL_8:.*]] = "tf.Reshape"(%[[VAL_7]], %[[CST_8]]) : (tensor<6x128xf32>, tensor<3xi64>) -> tensor<2x3x128xf32> -// CHECK-DAG: %[[CST_9:.*]] = "tf.Const"() <{value = dense<[1, 0, 2]> : tensor<3xi64>}> : () -> tensor<3xi64> -// CHECK: %[[VAL_9:.*]] = "tf.Transpose"(%[[VAL_8]], %[[CST_9]]) : (tensor<2x3x128xf32>, tensor<3xi64>) -> tensor<3x2x128xf32> +// CHECK-DAG: %[[CST_9:.*]] = "tf.Const"() <{value = dense<[1, 2, 0]> : tensor<3xi64>}> : () -> tensor<3xi64> +// CHECK: %[[VAL_9:.*]] = "tf.Transpose"(%[[VAL_8]], %[[CST_9]]) : (tensor<2x3x128xf32>, tensor<3xi64>) -> tensor<3x128x2xf32> // CHECK: return %[[VAL_9]] // CHECK: } -func.func @convert_gather_batching_dims(%arg0: tensor<2x3x128xf32>, %arg1: tensor<3x2x128x1xi32>) -> tensor<3x2x128xf32> { +func.func @convert_gather_batching_dims(%arg0: tensor<2x3x128xf32>, %arg1: tensor<3x128x2x1xi32>) -> tensor<3x128x2xf32> { %0 = "mhlo.gather"(%arg0, %arg1) { dimension_numbers = #mhlo.gather< index_vector_dim = 3, start_index_map = [2], operand_batching_dims = [0, 1], - start_indices_batching_dims = [1, 0], + start_indices_batching_dims = [2, 0], collapsed_slice_dims = [2], >, indices_are_sorted = false, slice_sizes = dense<1> : tensor<3xi64> - } : (tensor<2x3x128xf32>, tensor<3x2x128x1xi32>) -> tensor<3x2x128xf32> - func.return %0 : tensor<3x2x128xf32> + } : (tensor<2x3x128xf32>, tensor<3x128x2x1xi32>) -> tensor<3x128x2xf32> + func.return %0 : tensor<3x128x2xf32> } // CHECK-LABEL: func @convert_gather_non_collapsed_index_dim( diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/prepare_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/prepare_hlo.mlir index f363b369d763..2fa440eee1a3 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/prepare_hlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/prepare_hlo.mlir @@ -845,3 +845,51 @@ func.func @mhlo_nd_fft(%arg0: tensor<2x3x345x256xf32>) -> tensor<2x3x345x129xcom // CHECK: return %2 : tensor<2x3x345x129xcomplex> // ----- + +// CHECK-LABEL: @mhlo_dynamic_fft_1 +func.func @mhlo_dynamic_fft_1(%arg0: tensor) -> tensor> { + %0 = "mhlo.fft"(%arg0) <{fft_length = dense<2560> : tensor<1xi64>, fft_type = #mhlo}> : (tensor) -> tensor> + return %0 : tensor> + // CHECK: %4 = "mhlo.get_dimension_size"(%arg0) <{dimension = 0 : i64}> : (tensor) -> tensor 
+ // CHECK: %5 = mhlo.reshape %4 : (tensor) -> tensor<1xi32> + // CHECK: %6 = "mhlo.concatenate"(%5, %3, %2, %1) <{dimension = 0 : i64}> : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + // CHECK: %7 = mhlo.dynamic_reshape %arg0, %6 : (tensor, tensor<4xi32>) -> tensor + // CHECK: %8 = "mhlo.fft"(%7) <{fft_length = dense<[1, 2560]> : tensor<2xi64>, fft_type = #mhlo}> : (tensor) -> tensor> + // CHECK: %9 = "mhlo.get_dimension_size"(%8) <{dimension = 0 : i64}> : (tensor>) -> tensor + // CHECK: %10 = mhlo.reshape %9 : (tensor) -> tensor<1xi32> + // CHECK: %11 = "mhlo.concatenate"(%10, %3, %0) <{dimension = 0 : i64}> : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> + // CHECK: %12 = mhlo.dynamic_reshape %8, %11 : (tensor>, tensor<3xi32>) -> tensor> + // CHECK: return %12 : tensor> +} + +// ----- + +// CHECK-LABEL: @mhlo_dynamic_fft_2 +func.func @mhlo_dynamic_fft_2(%arg0: tensor) -> tensor> { + %0 = "mhlo.fft"(%arg0) <{fft_length = dense<2560> : tensor<1xi64>, fft_type = #mhlo}> : (tensor) -> tensor> + return %0 : tensor> + // CHECK: %3 = "mhlo.get_dimension_size"(%arg0) <{dimension = 0 : i64}> : (tensor) -> tensor + // CHECK: %4 = mhlo.reshape %3 : (tensor) -> tensor<1xi32> + // CHECK: %5 = "mhlo.get_dimension_size"(%arg0) <{dimension = 1 : i64}> : (tensor) -> tensor + // CHECK: %6 = mhlo.reshape %5 : (tensor) -> tensor<1xi32> + // CHECK: %7 = "mhlo.concatenate"(%4, %6, %2, %1) <{dimension = 0 : i64}> : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + // CHECK: %8 = mhlo.dynamic_reshape %arg0, %7 : (tensor, tensor<4xi32>) -> tensor + // CHECK: %9 = "mhlo.fft"(%8) <{fft_length = dense<[1, 2560]> : tensor<2xi64>, fft_type = #mhlo}> : (tensor) -> tensor> + // CHECK: %10 = "mhlo.get_dimension_size"(%9) <{dimension = 0 : i64}> : (tensor>) -> tensor + // CHECK: %11 = mhlo.reshape %10 : (tensor) -> tensor<1xi32> + // CHECK: %12 = "mhlo.get_dimension_size"(%9) <{dimension = 1 : i64}> : (tensor>) -> tensor + // CHECK: %13 = mhlo.reshape %12 : (tensor) -> tensor<1xi32> + // CHECK: %14 = "mhlo.concatenate"(%11, %13, %0) <{dimension = 0 : i64}> : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> + // CHECK: %15 = mhlo.dynamic_reshape %9, %14 : (tensor>, tensor<3xi32>) -> tensor> + // CHECK: return %15 : tensor> +} + +// ----- + +// CHECK-LABEL: @mhlo_dynamic_fft_2_neg +func.func @mhlo_dynamic_fft_2_neg(%arg0: tensor) -> tensor> { + %0 = "mhlo.fft"(%arg0) <{fft_length = dense<2560> : tensor<1xi64>, fft_type = #mhlo}> : (tensor) -> tensor> + return %0 : tensor> + // CHECK: %0 = "mhlo.fft"(%arg0) <{fft_length = dense<2560> : tensor<1xi64>, fft_type = #mhlo}> : (tensor) -> tensor> + // CHECK: return %0 : tensor> +} diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir index a8146487705c..a77d02e78c1d 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir @@ -1758,27 +1758,27 @@ func.func @gather_offset(%arg0: tensor<1x20xi32>, %arg1: tensor<1x1xi32>) -> ten // CHECK-LABEL: gather_batching_dims -func.func @gather_batching_dims(%arg0: tensor<2x3x128xf32>, %arg1: tensor<3x2x128x1xi32>) -> tensor<3x2x128xf32> { +func.func @gather_batching_dims(%arg0: tensor<2x3x128xf32>, %arg1: tensor<3x128x2x1xi32>) -> tensor<3x128x2xf32> { %0 = "mhlo.gather"(%arg0, %arg1) { dimension_numbers = #mhlo.gather< index_vector_dim = 3, start_index_map 
= [2], operand_batching_dims = [0, 1], - start_indices_batching_dims = [1, 0], + start_indices_batching_dims = [2, 0], collapsed_slice_dims = [2], >, indices_are_sorted = false, slice_sizes = dense<1> : tensor<3xi64> - } : (tensor<2x3x128xf32>, tensor<3x2x128x1xi32>) -> tensor<3x2x128xf32> - func.return %0 : tensor<3x2x128xf32> + } : (tensor<2x3x128xf32>, tensor<3x128x2x1xi32>) -> tensor<3x128x2xf32> + func.return %0 : tensor<3x128x2xf32> } // CHECK-DAG: %[[CST:.*]] = arith.constant dense<[6, 128]> : tensor<2xi64> // CHECK: %[[VAL_0:.*]] = "tfl.cast"(%[[CST]]) : (tensor<2xi64>) -> tensor<2xi32> // CHECK: %[[VAL_1:.*]] = "tfl.reshape"(%arg0, %[[VAL_0]]) : (tensor<2x3x128xf32>, tensor<2xi32>) -> tensor<6x128xf32> -// CHECK-DAG: %[[VAL_2:.*]] = "tfl.pseudo_const"() <{value = dense<[1, 0, 2, 3]> : tensor<4xi64>}> : () -> tensor<4xi64> +// CHECK-DAG: %[[VAL_2:.*]] = "tfl.pseudo_const"() <{value = dense<[2, 0, 1, 3]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK: %[[VAL_3:.*]] = "tfl.cast"(%[[VAL_2]]) : (tensor<4xi64>) -> tensor<4xi32> -// CHECK: %[[VAL_4:.*]] = "tfl.transpose"(%arg1, %[[VAL_3]]) : (tensor<3x2x128x1xi32>, tensor<4xi32>) -> tensor<2x3x128x1xi32> +// CHECK: %[[VAL_4:.*]] = "tfl.transpose"(%arg1, %[[VAL_3]]) : (tensor<3x128x2x1xi32>, tensor<4xi32>) -> tensor<2x3x128x1xi32> // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<[6, 128, 1]> : tensor<3xi64> // CHECK: %[[VAL_5:.*]] = "tfl.cast"(%[[CST_0]]) : (tensor<3xi64>) -> tensor<3xi32> // CHECK: %[[VAL_6:.*]] = "tfl.reshape"(%[[VAL_4]], %[[VAL_5]]) : (tensor<2x3x128x1xi32>, tensor<3xi32>) -> tensor<6x128x1xi32> @@ -1796,9 +1796,9 @@ func.func @gather_batching_dims(%arg0: tensor<2x3x128xf32>, %arg1: tensor<3x2x12 // CHECK-DAG: %[[CST_6:.*]] = arith.constant dense<[2, 3, 128]> : tensor<3xi64> // CHECK: %[[VAL_13:.*]] = "tfl.cast"(%[[CST_6]]) : (tensor<3xi64>) -> tensor<3xi32> // CHECK: %[[VAL_14:.*]] = "tfl.reshape"(%[[VAL_12]], %[[VAL_13]]) : (tensor<6x128xf32>, tensor<3xi32>) -> tensor<2x3x128xf32> -// CHECK: %[[VAL_15:.*]] = "tfl.pseudo_const"() <{value = dense<[1, 0, 2]> : tensor<3xi64>}> : () -> tensor<3xi64> +// CHECK: %[[VAL_15:.*]] = "tfl.pseudo_const"() <{value = dense<[1, 2, 0]> : tensor<3xi64>}> : () -> tensor<3xi64> // CHECK: %[[VAL_16:.*]] = "tfl.cast"(%[[VAL_15]]) : (tensor<3xi64>) -> tensor<3xi32> -// CHECK: %[[VAL_17:.*]] = "tfl.transpose"(%[[VAL_14]], %[[VAL_16]]) : (tensor<2x3x128xf32>, tensor<3xi32>) -> tensor<3x2x128xf32> +// CHECK: %[[VAL_17:.*]] = "tfl.transpose"(%[[VAL_14]], %[[VAL_16]]) : (tensor<2x3x128xf32>, tensor<3xi32>) -> tensor<3x128x2xf32> // ----- @@ -3801,6 +3801,26 @@ func.func @mhlo_nd_fft_1(%arg0: tensor<2x3x345x4x256xf32>) -> tensor<2x3x345x4x1 // ----- +// CHECK-LABEL: @mhlo_dynamic_fft_1 +func.func @mhlo_dynamic_fft_1(%arg0: tensor) -> tensor> { + %0 = "mhlo.fft"(%arg0) <{fft_length = dense<[1, 2560]> : tensor<2xi64>, fft_type = #mhlo}> : (tensor) -> tensor> + return %0 : tensor> + // CHECK: %cst = arith.constant dense<[1, 2560]> : tensor<2xi32> + // CHECK: %0 = "tfl.rfft2d"(%arg0, %cst) : (tensor, tensor<2xi32>) -> tensor> + // CHECK: return %0 : tensor> +} + +// ----- + +// CHECK-LABEL: @mhlo_dynamic_fft_2 +func.func @mhlo_dynamic_fft_2(%arg0: tensor) -> tensor> { + %9 = "mhlo.fft"(%arg0) <{fft_length = dense<[1, 2560]> : tensor<2xi64>, fft_type = #mhlo}> : (tensor) -> tensor> + return %9 : tensor> + // CHECK: %cst = arith.constant dense<[1, 2560]> : tensor<2xi32> + // CHECK: %0 = "tfl.rfft2d"(%arg0, %cst) : (tensor, tensor<2xi32>) -> tensor> + // CHECK: return %0 : tensor> +} + 
//===----------------------------------------------------------------------===// // mhlo.imag //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc index 6c118468653c..4107859b7412 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc @@ -429,9 +429,19 @@ class UniformDequantizeFunctionCallPattern { class ComposeUniformQuantizedConvolutionOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(stablehlo::ConvolutionOp op) const final { + LogicalResult matchAndRewrite(stablehlo::ConvolutionOp op, + PatternRewriter& rewriter) const final { + if (match(op).failed()) { + return failure(); + } + rewrite(op, rewriter); + return success(); + } + + private: + LogicalResult match(stablehlo::ConvolutionOp op) const { // Verify operands' types. for (Type operand_type : op.getOperandTypes()) { if (Type element_type = @@ -643,8 +653,7 @@ class ComposeUniformQuantizedConvolutionOp return success(); } - void rewrite(stablehlo::ConvolutionOp op, - PatternRewriter& rewriter) const final { + void rewrite(stablehlo::ConvolutionOp op, PatternRewriter& rewriter) const { // Rewrite `call @uniform_quantize` -> `stablehlo.uniform_quantize`. auto input_i8_to_f32_convert_op = cast(op.getOperand(0).getDefiningOp()); @@ -883,8 +892,19 @@ class ComposeUniformQuantizedConvolutionOp class ComposeUniformQuantizedDotGeneralOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; - LogicalResult match(stablehlo::DotGeneralOp op) const final { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(stablehlo::DotGeneralOp op, + PatternRewriter& rewriter) const final { + if (match(op).failed()) { + return failure(); + } + rewrite(op, rewriter); + return success(); + } + + private: + LogicalResult match(stablehlo::DotGeneralOp op) const { auto input_i8_to_f32_convert_op = TryCast(op.getOperand(0).getDefiningOp(), /*name=*/"input_i8_to_f32_convert_op"); @@ -988,8 +1008,7 @@ class ComposeUniformQuantizedDotGeneralOp return success(); } - void rewrite(stablehlo::DotGeneralOp op, - PatternRewriter& rewriter) const final { + void rewrite(stablehlo::DotGeneralOp op, PatternRewriter& rewriter) const { // Build uniform quantized type for input. 
auto input_i8_to_f32_convert_op = cast(op.getOperand(0).getDefiningOp()); @@ -1306,9 +1325,19 @@ class ComposeUniformQuantizedDotGeneralOp class ComposeUniformQuantizedDotGeneralOpWithTwoQuantizedActivations : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(stablehlo::DotGeneralOp op) const final { + LogicalResult matchAndRewrite(stablehlo::DotGeneralOp op, + PatternRewriter& rewriter) const final { + if (match(op).failed()) { + return failure(); + } + rewrite(op, rewriter); + return success(); + } + + private: + LogicalResult match(stablehlo::DotGeneralOp op) const { // q1 - z1 if (failed(MatchQuantizedOperand(op.getOperand(0)))) { LLVM_DEBUG(llvm::dbgs() @@ -1365,8 +1394,7 @@ class ComposeUniformQuantizedDotGeneralOpWithTwoQuantizedActivations return success(); } - void rewrite(stablehlo::DotGeneralOp op, - PatternRewriter& rewriter) const final { + void rewrite(stablehlo::DotGeneralOp op, PatternRewriter& rewriter) const { // Build uniform quantized type for input 1 (lhs). auto input1_zero_point_subtract_op = cast(op.getOperand(0).getDefiningOp()); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td index 7fe70321a1dd..2cf060c6379d 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td @@ -29,21 +29,21 @@ def LegalizeHardSwishComposite: Pat< (TFL_HardSwishOp $input)>; def IsNchwLayoutOp: Constraint() " + "$0.get(\"is_nchw_op\") && llvm::dyn_cast($0.get(\"is_nchw_op\")) " "== mlir::BoolAttr::get($_builder.getContext(), true)">>; def IsNhwcLayoutOp: Constraint>; class HasRank : Constraint< - CPred<"$0.getType().cast().hasRank() && " - "$0.getType().cast().getRank() == " # n>>; + CPred<"llvm::cast($0.getType()).hasRank() && " + "llvm::cast($0.getType()).getRank() == " # n>>; class HasRankAtLeast : Constraint< - CPred<"$0.getType().cast().hasRank() && " - "$0.getType().cast().getRank() >= " # n>>; + CPred<"llvm::cast($0.getType()).hasRank() && " + "llvm::cast($0.getType()).getRank() >= " # n>>; def I32ElementsVal : Constraint().getElementType().isInteger(32)">, + "llvm::cast($0.getType()).getElementType().isInteger(32)">, "32 bit integer tensor">; // TODO(b/343278954): Move the creation of transposes to a separate prepare pass @@ -133,6 +133,27 @@ def LegalizeCompositeGELU : Pat< (TFL_GeluOp $inputs, (GetCompositeAttributeAs<"approximate", "BoolAttr"> $attrs))>; +def LegalizeCompositeGELUDynamicShaped : Pat< + (MHLO_CompositeOp:$composite + (variadic $_, $inputs), + ConstantStrAttr, $attrs, $_, $_), + (TFL_GeluOp $inputs, + (GetCompositeAttributeAs<"approximate", "BoolAttr"> $attrs))>; + +def LegalizeCompositeGELUDynamicShaped2 : Pat< + (MHLO_CompositeOp:$composite + (variadic $_, $_, $inputs), + ConstantStrAttr, $attrs, $_, $_), + (TFL_GeluOp $inputs, + (GetCompositeAttributeAs<"approximate", "BoolAttr"> $attrs))>; + +def LegalizeCompositeGELUDynamicShaped3 : Pat< + (MHLO_CompositeOp:$composite + (variadic $_, $_, $_, $inputs), + ConstantStrAttr, $attrs, $_, $_), + (TFL_GeluOp $inputs, + (GetCompositeAttributeAs<"approximate", "BoolAttr"> $attrs))>; + def LegalizeCompositeOdmlEmbeddingLookup : Pat< (MHLO_CompositeOp:$composite (variadic $indices, $table), @@ -151,6 +172,24 @@ def LegalizeCompositeOdmlEmbeddingLookupDynamicShaped : Pat< 
(I32ElementsVal $indices), (HasRankAtLeast<2> $table)]>; +def LegalizeCompositeOdmlEmbeddingLookupDynamicShaped2 : Pat< + (MHLO_CompositeOp:$composite + (variadic $_, $_, $indices, $table), + ConstantStrAttr, $attrs, $_, $_), + (TFL_EmbeddingLookupOp $indices, $table), + [(HasRank<1> $indices), + (I32ElementsVal $indices), + (HasRankAtLeast<2> $table)]>; + +def LegalizeCompositeOdmlEmbeddingLookupDynamicShaped3 : Pat< + (MHLO_CompositeOp:$composite + (variadic $_, $indices, $table), + ConstantStrAttr, $attrs, $_, $_), + (TFL_EmbeddingLookupOp $indices, $table), + [(HasRank<1> $indices), + (I32ElementsVal $indices), + (HasRankAtLeast<2> $table)]>; + def LegalizeCompositeOdmlRandomUniform : Pat< (MHLO_CompositeOp:$composite (variadic $shape), @@ -165,4 +204,22 @@ def LegalizeCompositeOdmlRandomStandardNormal : Pat< ConstantStrAttr, $attrs, $_, $_), (TFL_RandomStandardNormalOp $shape, (GetCompositeAttributeAs<"seed", "IntegerAttr"> $attrs), - (GetCompositeAttributeAs<"seed2", "IntegerAttr"> $attrs))>; \ No newline at end of file + (GetCompositeAttributeAs<"seed2", "IntegerAttr"> $attrs))>; + +def LegalizeCompositeUnpack : Pat< + (MHLO_CompositeOp:$composite + (variadic $inputs), + ConstantStrAttr, $attrs, $_, $_), + (TFL_UnpackOp $inputs, + (GetCompositeAttributeAs<"num", "IntegerAttr"> $attrs), + (GetCompositeAttributeAs<"axis", "IntegerAttr"> $attrs))>; + +def LegalizeCompositePack4Elements : Pat< + (MHLO_CompositeOp:$composite + // TD not able to represent variadic of variadic now. + // Move to C++ matcher to support more cases. + (variadic $i0, $i1, $i2, $i3), + ConstantStrAttr, $attrs, $_, $_), + (TFL_PackOp (variadic $i0, $i1, $i2, $i3), + (GetCompositeAttributeAs<"values_count", "IntegerAttr"> $attrs), + (GetCompositeAttributeAs<"axis", "IntegerAttr"> $attrs))>; diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.td index 30d6f4247fba..7d905119b3f0 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.td +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.td @@ -33,7 +33,7 @@ def GetI32DenseAttr: NativeCodeCall< // Receives a composite DictionaryAttr and returns the value of the Attribute // with the key `attr_name` as the type provided by `attr_type`. class GetCompositeAttributeAs: - NativeCodeCall<"$0.get(\"" # attr_name # "\").dyn_cast<" # attr_type # ">()">; + NativeCodeCall<"llvm::dyn_cast<" # attr_type # ">($0.get(\"" # attr_name # "\"))">; // Receives a composite DictionaryAttr and returns the value of the Attribute // with the key `attr_name` as a DenseIntElementsAttr. diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc index 67763345add8..044848ce93ce 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc @@ -27,6 +27,7 @@ limitations under the License. 
#include #include +#include "absl/status/status.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" @@ -2809,7 +2810,7 @@ class ConvertGatherOp : public OpConversionPattern { } for (int i = 0; i < slice_sizes_vector.size(); ++i) { int s = slice_sizes_vector[i]; - if (llvm::count(start_indices_batching_dims, i)) { + if (llvm::count(operand_batching_dims, i)) { if (s != 1) { return rewriter.notifyMatchFailure(gather_op, "unsupported slice sizes"); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD index 0d47a3f038f5..9e2f1cf33f49 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD @@ -96,6 +96,7 @@ cc_library( ], deps = [ "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "@llvm-project//mlir:AsmParser", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", @@ -340,8 +341,8 @@ cc_library( srcs = ["fft.cc"], hdrs = ["fft.h"], deps = [ - "//tensorflow/compiler/mlir/lite:const_tensor_utils", "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", "@llvm-project//llvm:Support", "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:IR", diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc index ec9b0e16778b..f89f8acd4463 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc @@ -17,7 +17,9 @@ limitations under the License. #include +#include "mlir/AsmParser/AsmParser.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project @@ -27,6 +29,7 @@ limitations under the License. namespace mlir { namespace odml { +namespace { class ConvertCustomCallOp : public OpConversionPattern { public: @@ -37,10 +40,45 @@ class ConvertCustomCallOp : public OpConversionPattern { ConversionPatternRewriter& rewriter) const final; }; +// TFL op on StableHLO CustomCall carrier must serialize its attributes in +// the CustomCallOp's backend_config StringAttr, following MLIR +// DictionaryAttr serialization format. If no attributes are specified, +// the backend_config should be the serialized empty DictionaryAttr. 
+mlir::DictionaryAttr ParseSerializedTFLOpAttributes( + std::optional backend_config, MLIRContext* ctx) { + if (!backend_config) { + return nullptr; + } + + auto serialized_attributes = + mlir::dyn_cast_or_null(*backend_config); + if (!serialized_attributes) { + return nullptr; + } + + auto dict_attribute = mlir::dyn_cast_or_null( + parseAttribute(serialized_attributes.getValue(), ctx)); + return dict_attribute; +} + LogicalResult ConvertCustomCallOp::matchAndRewrite( mhlo::CustomCallOp mhlo_custom_call, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const { auto call_target_name = mhlo_custom_call.getCallTargetName(); + if (call_target_name.starts_with("tfl.")) { + auto bc = mhlo_custom_call.getBackendConfig(); + if (mlir::DictionaryAttr attributes = + ParseSerializedTFLOpAttributes(bc, getContext())) { + // Short-cut: TFL direct lowering on StableHLO CustomCall carrier. + mlir::OperationState new_op(mhlo_custom_call.getLoc(), call_target_name, + mhlo_custom_call.getOperands(), + mhlo_custom_call.getResultTypes(), + attributes.getValue()); + rewriter.replaceOp(mhlo_custom_call, rewriter.create(new_op)); + return success(); + } + } + if (!call_target_name.starts_with("custom_call.")) { return failure(); } @@ -102,9 +140,16 @@ std::optional IsCustomCallLegal(mhlo::CustomCallOp op) { return false; } } + if (call_target_name.starts_with("tfl.")) { + auto bc = op.getBackendConfig(); + if (!bc || mlir::isa(*bc)) { + return false; + } + } return true; } +} // namespace void PopulateCustomCallPatterns(MLIRContext* ctx, RewritePatternSet& patterns, ConversionTarget& target) { diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.cc index 8f08a0f8a2b1..f2d29774c31c 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.cc @@ -19,10 +19,10 @@ limitations under the License. #include #include -#include #include #include +#include "mhlo/IR/hlo_ops.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project @@ -32,7 +32,7 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" // IWYU pragma: keep -#include "tensorflow/compiler/mlir/lite/utils/const_tensor_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" namespace mlir::odml { @@ -62,14 +62,6 @@ bool IsSupportedRfftOp(mhlo::FftOp fft_op) { if (fft_lengths.size() > 2) return false; // Only support 2D FFT. - // TFLite RFFT2d supports only int32 fft_lengths that are powers of 2. - for (int64_t fft_length : fft_lengths) { - if (fft_length != 1 && (!TFL::IsPowerOfTwo(fft_length) || - fft_length > std::numeric_limits::max())) { - return false; - } - } - // Check if the trailing input shape matches the fft_lengths. const std::vector input_shape = mlir::cast(fft_op.getOperand().getType()).getShape(); @@ -77,6 +69,16 @@ bool IsSupportedRfftOp(mhlo::FftOp fft_op) { fft_lengths.begin(), fft_lengths.end()); } +// Returns a tensor of the dimension size of the input tensor. 
Result of +// mhlo::GetDimensionSizeOp is always a scalar value, but we need a tensor to +// concatenate with other dimension sizes. +Value GetDimensionSizeTensor(OpBuilder& rewriter, Location loc, Value input, + int64_t dim) { + auto size_scalar = rewriter.create(loc, input, dim); + return rewriter.create( + loc, RankedTensorType::get({1}, rewriter.getI32Type()), size_scalar); +} + // Convert rfft to rfft2d. // The transformation pattern looks like below: // @@ -114,18 +116,22 @@ class ConvertNDFftTo2DFftOp : public OpRewritePattern { auto input_type = mlir::dyn_cast_or_null(fft_op.getOperand().getType()); const std::vector input_shape = - mlir::cast(fft_op.getOperand().getType()).getShape(); + input_type + ? input_type.getShape() + : mlir::cast(fft_op.getOperand().getType()).getShape(); - auto fft_operand = fft_op.getOperand(); + Value fft_operand = fft_op.getOperand(); auto output_type = mlir::cast(fft_op.getResult().getType()); // Create a new fft_length attribute for the 2D FFT. SmallVector new_fft_lengths = {1, fft_lengths.back()}; auto new_fft_lengths_attr = rewriter.getI64TensorAttr(new_fft_lengths); + bool is_dynamic_shape = !input_type || !input_type.hasStaticShape(); + // Input can have a single trivial batch dim next to the fft dimension, in // which case we don't need to expand the input. - if (input_type && (input_shape[input_shape.size() - 2] != 1)) { + if (input_shape[input_shape.size() - 2] != 1) { const std::vector output_shape = output_type.getShape(); // [a, b, c, d, e] -> [a, b, c, d, 1, e] @@ -133,11 +139,42 @@ class ConvertNDFftTo2DFftOp : public OpRewritePattern { input_shape.end() - 1}; expanded_input_shape.push_back(1); expanded_input_shape.push_back(input_shape.back()); - // Replace the expand_dims op with a reshape op: - auto expanded_input_type = mlir::RankedTensorType::get( + auto expanded_input_type = tensorflow::GetTypeFromTFTensorShape( expanded_input_shape, input_type.getElementType()); - fft_operand = rewriter.create( - fft_op.getLoc(), expanded_input_type, fft_operand); + + // Dynamic shape needs to be handled separately as mhlo::ReshapeOp does + // not support dynamic shape. + if (is_dynamic_shape) { + // Programmatically- + // 1. Get the dimensions of the input tensor and create shape vector. + // 2. Insert a 1 as the penultimate dimension size. + // 3. Concatenate the dimension sizes to create a new SHAPE tensor. + SmallVector expanded_input_shape_values; + for (int i = 0; i < input_shape.size() - 1; ++i) { + expanded_input_shape_values.push_back(GetDimensionSizeTensor( + rewriter, fft_op.getLoc(), fft_operand, i)); + } + expanded_input_shape_values.push_back(rewriter.create( + fft_op.getLoc(), rewriter.getI32TensorAttr({1}))); + expanded_input_shape_values.push_back(GetDimensionSizeTensor( + rewriter, fft_op.getLoc(), fft_operand, input_shape.size() - 1)); + + auto expanded_input_shape_tensor = rewriter.create( + fft_op.getLoc(), + RankedTensorType::get( + {static_cast(expanded_input_shape_values.size())}, + rewriter.getI32Type()), + expanded_input_shape_values, 0); + + // Create a new mhlo.dynamic_reshape op with the expanded input and + // expanded input shape. SHAPE tensor is created in the previous step. 
+ fft_operand = rewriter.create( + fft_op.getLoc(), expanded_input_type, fft_operand, + expanded_input_shape_tensor); + } else { + fft_operand = rewriter.create( + fft_op.getLoc(), expanded_input_type, fft_operand); + } SmallVector new_output_shape = {output_shape.begin(), output_shape.end() - 1}; @@ -152,12 +189,34 @@ class ConvertNDFftTo2DFftOp : public OpRewritePattern { rewriter.create(fft_op.getLoc(), output_type, fft_operand, fft_op.getFftType(), new_fft_lengths_attr); - if (input_type && (input_shape[input_shape.size() - 2] != 1)) { + if (input_shape[input_shape.size() - 2] != 1) { // Squeeze the output dimensions back to 2D. - auto squeeze_op = rewriter.create( - fft_op.getLoc(), fft_op.getResult().getType(), new_fft.getResult()); - - rewriter.replaceOp(fft_op, squeeze_op.getResult()); + if (is_dynamic_shape) { + SmallVector output_shape_values; + for (int i = 0; i < new_fft.getResult().getType().getShape().size() - 2; + ++i) { + output_shape_values.push_back(GetDimensionSizeTensor( + rewriter, fft_op.getLoc(), new_fft.getResult(), i)); + } + output_shape_values.push_back(GetDimensionSizeTensor( + rewriter, fft_op.getLoc(), new_fft.getResult(), + new_fft.getResult().getType().getShape().size() - 1)); + + auto shape_tensor = rewriter.create( + fft_op.getLoc(), + RankedTensorType::get( + {static_cast(output_shape_values.size())}, + rewriter.getI32Type()), + output_shape_values, 0); + auto squeeze_op = rewriter.create( + fft_op.getLoc(), fft_op.getResult().getType(), new_fft.getResult(), + shape_tensor); + rewriter.replaceOp(fft_op, squeeze_op.getResult()); + } else { + auto squeeze_op = rewriter.create( + fft_op.getLoc(), fft_op.getResult().getType(), new_fft.getResult()); + rewriter.replaceOp(fft_op, squeeze_op.getResult()); + } } else { rewriter.replaceOp(fft_op, new_fft.getResult()); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gather.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gather.cc index daaea546077c..e10ec578f8cb 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gather.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gather.cc @@ -614,7 +614,7 @@ LogicalResult LegalizeGatherToGatherND::matchAndRewrite( } for (int i = 0; i < slice_sizes_vector.size(); ++i) { int s = slice_sizes_vector[i]; - if (llvm::count(start_indices_batching_dims, i)) { + if (llvm::count(operand_batching_dims, i)) { if (s != 1) { return rewriter.notifyMatchFailure(gather_op, "unsupported slice sizes"); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td index b3d619b0dd8c..05a68b2cff37 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td @@ -146,27 +146,27 @@ def : Pat<(MHLO_ConvertOp MHLO_Tensor:$operand), foreach Mapping = [[MHLO_AbsOp, TF_AbsOp], [MHLO_BitcastConvertOp, TF_BitcastOp], [MHLO_CeilOp, TF_CeilOp], - [MHLO_CosineOp, TF_CosOp], - [MHLO_Expm1Op, TF_Expm1Op], [MHLO_FloorOp, TF_FloorOp], [MHLO_ImagOp, TF_ImagOp], [MHLO_IsFiniteOp, TF_IsFiniteOp], - [MHLO_LogOp, TF_LogOp], - [MHLO_Log1pOp, TF_Log1pOp], - [MHLO_LogisticOp, TF_SigmoidOp], [MHLO_NegOp, TF_NegOp], [MHLO_RealOp, TF_RealOp], - [MHLO_RsqrtOp, TF_RsqrtOp], - [MHLO_SineOp, TF_SinOp], - [MHLO_SignOp, TF_SignOp], - [MHLO_SqrtOp, TF_SqrtOp], - 
[MHLO_TanhOp, TF_TanhOp]] in + [MHLO_SignOp, TF_SignOp]] in def : Pat<(Mapping[0] TF_IntOrFpTensor:$input), (Mapping[1] $input)>; def ConstDefaultResultAccuracyAttr : ConstantAttr; -foreach Mapping = [[MHLO_ExpOp, TF_ExpOp]] in { +foreach Mapping = [[MHLO_CosineOp, TF_CosOp], + [MHLO_Expm1Op, TF_Expm1Op], + [MHLO_ExpOp, TF_ExpOp], + [MHLO_LogOp, TF_LogOp], + [MHLO_Log1pOp, TF_Log1pOp], + [MHLO_LogisticOp, TF_SigmoidOp], + [MHLO_RsqrtOp, TF_RsqrtOp], + [MHLO_SineOp, TF_SinOp], + [MHLO_SqrtOp, TF_SqrtOp], + [MHLO_TanhOp, TF_TanhOp]] in { def : Pat<(Mapping[0] $input, ConstDefaultResultAccuracyAttr), (Mapping[1] MHLO_Tensor:$input)>; } @@ -283,7 +283,7 @@ def : Pat<(MHLO_ConcatenateOp $inputs, $dim), //===----------------------------------------------------------------------===// class HasChloCompareType : - CPred<"$_self.cast<::mlir::chlo::ComparisonTypeAttr>().getValue() == " # value>; + CPred<"llvm::cast<::mlir::chlo::ComparisonTypeAttr>($_self).getValue() == " # value>; // Attribute value should be such that it matches the comparison used by // TensorFlow, if the attribute is present. @@ -298,7 +298,7 @@ class CHLO_ComparisonDirectionValue : ConstantAttr; class HasMhloCompareType : - CPred<"$_self.cast<::mlir::mhlo::ComparisonTypeAttr>().getValue() == " # value>; + CPred<"llvm::cast<::mlir::mhlo::ComparisonTypeAttr>($_self).getValue() == " # value>; // Attribute value should be such that it matches the comparison used by // TensorFlow, if the attribute is present. diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc index dc7ba979076b..8625fe82afc4 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc @@ -45,7 +45,7 @@ bool IsSupportedComposite(::mlir::stablehlo::CompositeOp op) { // List of supported composites to represent using CustomOp. 
return llvm::is_contained( {"odml.update_kv_cache", "odml.update_external_kv_cache", - "odml.quantize_and_dequantize"}, + "odml.quantize_and_dequantize", "odml.detector"}, op.getName()); } @@ -74,6 +74,12 @@ LogicalResult BuildOption(flexbuffers::Builder* fbb, Operation* op, return success(); } + if (mlir::isa<::mlir::StringAttr>(attr)) { + fbb->String( + key, mlir::dyn_cast(attr).getValue().str().c_str()); + return success(); + } + return op->emitWarning("serialization not supported for : ") << key; } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc index e1f1681a3d7a..704dbf37d680 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc @@ -79,7 +79,6 @@ class StablehloToOdmlTypeConverter : public vhlo::VhloTypeConverter { }); addBuiltinToVhloConversions(); - addArgumentMaterialization(MaterializeIllegalCast); addSourceMaterialization(MaterializeIllegalCast); addTargetMaterialization(MaterializeIllegalCast); } @@ -112,7 +111,6 @@ class VhloToStablehloTypeConverter : public vhlo::VhloTypeConverter { }); addVhloToBuiltinConversions(); - addArgumentMaterialization(MaterializeIllegalCast); addSourceMaterialization(MaterializeIllegalCast); addTargetMaterialization(MaterializeIllegalCast); } @@ -144,7 +142,7 @@ void ConvertAndWrapUsesInUnrealizedCast(Value result, TypeConverter &converter, IRRewriter &rewriter) { auto type = result.getType(); result.setType(converter.convertType(result.getType())); - auto new_value = converter.materializeArgumentConversion( + auto new_value = converter.materializeSourceConversion( rewriter, result.getLoc(), type, {result}); rewriter.replaceAllUsesExcept(result, new_value, new_value.getDefiningOp()); } @@ -160,7 +158,7 @@ void WrapOperandsInUnrealizedCastAndConvert(Operation *op, IRRewriter &rewriter) { for (int i = 0; i < op->getNumOperands(); ++i) { auto operand = op->getOperand(i); - auto new_operand = converter.materializeArgumentConversion( + auto new_operand = converter.materializeSourceConversion( rewriter, op->getLoc(), converter.convertType(operand.getType()), {operand}); op->setOperand(i, new_operand); @@ -218,7 +216,7 @@ LogicalResult ApplyStablehloToVhloPatterns(ModuleOp module, StablehloToOdmlTypeConverter converter; RewritePatternSet patterns(context); - stablehlo::populateStablehloToVhloPatterns(&patterns, &converter, context); + stablehlo::populateStablehloToVhloPatterns(context, &patterns, &converter); if (failed(applyPartialConversion(module, target, std::move(patterns)))) { return module->emitError("Failed partial conversion to VHLO"); @@ -248,7 +246,7 @@ LogicalResult ApplyVhloToStablehloPatterns(ModuleOp module) { VhloToStablehloTypeConverter converter; RewritePatternSet patterns(context); - stablehlo::populateVhloToStablehloPatterns(&patterns, &converter, context); + stablehlo::populateVhloToStablehloPatterns(context, &patterns, &converter); if (failed(applyPartialConversion(module, target, std::move(patterns)))) { return module->emitError("Failed partial conversion to StableHLO"); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.cc index 7ff1ce6cc29d..321fa5519efb 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.cc +++ 
b/tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.cc @@ -40,8 +40,8 @@ limitations under the License. #include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_traits.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_util.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h" namespace mlir { namespace odml { @@ -97,7 +97,7 @@ void PrintOpStatsPass::runOnOperation() { isa(op->getResult(0).getType())) { // Use rhs operand to detect types for dynamic range quantizable ops. Value value_for_deducing_op_type = - (dyn_cast_or_null(op)) + (dyn_cast_or_null(op)) ? op->getOperand(1) : op->getResult(0); ShapedType value_shaped_type = mlir::dyn_cast_or_null( diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize_layout.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize_layout.cc index d251f49cfa28..b0bbeb57c5a6 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize_layout.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize_layout.cc @@ -91,7 +91,7 @@ struct TransposeCommuteWithPad : public OpRewritePattern { LogicalResult matchAndRewrite(stablehlo::PadOp pad_op, PatternRewriter& rewriter) const override { Value pad_input = pad_op.getOperand(); - RankedTensorType pad_type = pad_op.getType().cast(); + RankedTensorType pad_type = mlir::cast(pad_op.getType()); auto transpose_op = pad_input.getDefiningOp(); if (!transpose_op || !transpose_op->hasOneUse()) return failure(); @@ -132,7 +132,7 @@ struct TransposeCommuteWithReduceWindow Value reduce_input = inputs[0]; RankedTensorType reduce_type = - reduce_op.getResultTypes()[0].cast(); + mlir::cast(reduce_op.getResultTypes()[0]); auto transpose_op = reduce_input.getDefiningOp(); if (!transpose_op || !transpose_op->hasOneUse()) return failure(); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/prepare_hlo.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/prepare_hlo.td index 9b6f6efbfcf4..c0b274ac1f85 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/prepare_hlo.td +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/prepare_hlo.td @@ -56,10 +56,10 @@ def AreDnumsFullyDefined : Constraint()" + "llvm::cast($2.getType())" ".clone($0.PermuteShape(" "$1," - "$2.getType().cast().getShape()))">; + "llvm::cast($2.getType()).getShape()))">; def IsStandardConv : Constraint())">>; @@ -380,7 +380,7 @@ def GetExplicitPaddingArgs : NativeCodeCall< // Gets element type from Value. def GetElementType : NativeCodeCall< - "$0.getType().cast().getElementType()">; + "llvm::cast($0.getType()).getElementType()">; // Given element type, get a DenseElements with scalar shape and 0 value. 
def GetZeroScalarAttrFromType : NativeCodeCall< @@ -439,9 +439,9 @@ def UnfuseConvWithExplicitPadding : Pat<(MHLO_ConvolutionOp:$conv def TrivialStrides : NativeCodeCall< "DenseIntElementsAttr::get(" - "RankedTensorType::get({$0.getType().cast().getRank()}," + "RankedTensorType::get({llvm::cast($0.getType()).getRank()}," "$_builder.getI64Type())," - "llvm::SmallVector($0.getType().cast().getRank()," + "llvm::SmallVector(llvm::cast($0.getType()).getRank()," "1))">; def SliceStart : NativeCodeCall< diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_fuse_convolution_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_fuse_convolution_pass.cc index 6ccdb72abf34..fcecd557aeab 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_fuse_convolution_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_fuse_convolution_pass.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #include -#include #include #include "stablehlo/dialect/StablehloOps.h" diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h index f0ef634c848b..ac8aff94f06d 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h @@ -25,12 +25,6 @@ limitations under the License. namespace mlir { namespace odml { -// Unfuses MHLO batch norm inference op into arithmetic ops. -std::unique_ptr createUnfuseBatchNormPass(); - -// Fuses MHLO binary element-wise ops and convolution op. -std::unique_ptr createFuseConvolutionPass(); - // Applies various optimizations on MHLO IR. std::unique_ptr createOptimizePass(); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_unfuse_batch_norm_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_unfuse_batch_norm_pass.cc index 3b0ec3c97400..32d76f918480 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_unfuse_batch_norm_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_unfuse_batch_norm_pass.cc @@ -15,7 +15,6 @@ limitations under the License. #include #include -#include #include #include "stablehlo/dialect/StablehloOps.h" diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.cc index e8a2bc870e96..c876347d2a2c 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.cc @@ -14,9 +14,10 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.h" +#include +#include #include #include -#include #include #include "flatbuffers/flatbuffers.h" // from @flatbuffers diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo_patterns.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo_patterns.td index c45e67ed5bfb..e438e9580697 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo_patterns.td +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo_patterns.td @@ -28,6 +28,9 @@ def ShapeToConst : NativeCodeCall<"ShapeToConst($_builder, $0)">; def CreateTFLCastToInt32Op : NativeCodeCall< "CreateCastToInt32($0, $_loc, $_builder)">; +def ConstDefaultResultAccuracyAttr : + ConstantAttr; + def : Pat< (MHLO_ConstantOp:$output $value), (Arith_ConstantOp $value), @@ -53,7 +56,7 @@ def : Pat< def I64AttrToI32Attr: NativeCodeCall< "$_builder.getI32IntegerAttr(" - "static_cast($0.cast().getInt()))">; + "static_cast(llvm::cast($0).getInt()))">; def : Pat< (MHLO_ConcatenateOp $inputs, $dim), @@ -295,7 +298,7 @@ foreach pair = [ // Check implicit bool cast of `$_self` to ensure Attribute is non-null before // casting. def HasSupportedComparisonType : AttrConstraint< - CPred<"!$_self || SupportedComparisonType($_self.cast())">>; + CPred<"!$_self || SupportedComparisonType(llvm::cast($_self))">>; class MHLO_ComparisonDirectionValue : ConstantAttr, "1.0f">), @@ -335,29 +338,26 @@ def : Pat<(MHLO_AbsOp MHLO_PredIntFpOrQuantizedTensor:$arg), (TFL_AbsOp $arg)>; foreach pair = [ [MHLO_BitcastConvertOp, TFL_BitcastOp], [MHLO_CeilOp, TFL_CeilOp], - [MHLO_CosineOp, TFL_CosOp], [MHLO_FloorOp, TFL_FloorOp], [MHLO_ImagOp, TFL_ImagOp], - [MHLO_LogOp, TFL_LogOp], - [MHLO_LogisticOp, TFL_LogisticOp], [MHLO_NegOp, TFL_NegOp], [MHLO_RealOp, TFL_RealOp], - [MHLO_RsqrtOp, TFL_RsqrtOp], - [MHLO_SineOp, TFL_SinOp], [MHLO_SignOp, TFL_SignOp], - [MHLO_SqrtOp, TFL_SqrtOp], - [MHLO_TanhOp, TFL_TanhOp] ] in { def : Pat< (pair[0] $input), (pair[1] $input)>; } -def ConstDefaultResultAccuracyAttr : - ConstantAttr; - foreach pair = [ + [MHLO_CosineOp, TFL_CosOp], [MHLO_ExpOp, TFL_ExpOp], + [MHLO_LogOp, TFL_LogOp], + [MHLO_LogisticOp, TFL_LogisticOp], + [MHLO_RsqrtOp, TFL_RsqrtOp], + [MHLO_SineOp, TFL_SinOp], + [MHLO_SqrtOp, TFL_SqrtOp], + [MHLO_TanhOp, TFL_TanhOp], ] in { def : Pat< (pair[0] $input, ConstDefaultResultAccuracyAttr), @@ -370,7 +370,7 @@ def : Pat< (TFL_CastOp $input)>; def : Pat< - (MHLO_Expm1Op F32Tensor:$x), + (MHLO_Expm1Op F32Tensor:$x, ConstDefaultResultAccuracyAttr), (TFL_SubOp (TFL_ExpOp $x), (Arith_ConstantOp @@ -385,7 +385,7 @@ def : Pat< ConstantAttr, "0.0f">))>; def : Pat< - (MHLO_Log1pOp F32Tensor:$x), + (MHLO_Log1pOp F32Tensor:$x, ConstDefaultResultAccuracyAttr), (TFL_LogOp (TFL_AddOp $x, diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc index eef657be2981..620a473f3334 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc @@ -18,12 +18,12 @@ limitations under the License. 
#include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h" -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h" -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h" -#include "tensorflow/compiler/mlir/stablehlo/transforms/stablehlo_passes.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/fold_broadcast_pass.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/rename_entrypoint_to_main.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/tf_stablehlo_pass.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc index f6f01cf68454..be4a10dd108b 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc @@ -24,6 +24,7 @@ limitations under the License. #include "absl/log/check.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -130,12 +131,12 @@ Operation* GetBiasConstOp(Operation* op) { TFL::QConstOp CreateTransposedTflConstOpForFilter( stablehlo::ConstantOp filter_constant_op, PatternRewriter& rewriter, bool is_per_channel) { - const auto filter_values = filter_constant_op.getValue() - .cast() - .getValues(); + const auto filter_values = + llvm::cast(filter_constant_op.getValue()) + .getValues(); ArrayRef filter_shape = - filter_constant_op.getType().cast().getShape(); + llvm::cast(filter_constant_op.getType()).getShape(); // Reverse the shapes. This makes sense, assuming that the filter tensor has a // rank of 2 (no batch dimension). 
@@ -159,16 +160,16 @@ TFL::QConstOp CreateTransposedTflConstOpForFilter( Type new_filter_quantized_type; if (is_per_channel) { - auto filter_quantized_type = GetElementType(filter_constant_op.getResult()) - .cast(); + auto filter_quantized_type = llvm::cast( + GetElementType(filter_constant_op.getResult())); new_filter_quantized_type = CreateI8F32UniformQuantizedPerAxisType( filter_constant_op->getLoc(), *rewriter.getContext(), filter_quantized_type.getScales(), filter_quantized_type.getZeroPoints(), /*quantization_dimension=*/0, /*narrow_range=*/true); } else { - auto filter_quantized_type = GetElementType(filter_constant_op.getResult()) - .cast(); + auto filter_quantized_type = llvm::cast( + GetElementType(filter_constant_op.getResult())); new_filter_quantized_type = CreateI8F32UniformQuantizedType( filter_constant_op->getLoc(), *rewriter.getContext(), filter_quantized_type.getScale(), filter_quantized_type.getZeroPoint(), @@ -235,8 +236,8 @@ TFL::QConstOp CreateTflConstOpForDummyBias( Type bias_quantized_type; if (is_per_channel) { const auto filter_quantized_element_type = - GetElementType(filter_const_op.getResult()) - .cast(); + llvm::cast( + GetElementType(filter_const_op.getResult())); // The storage type is i32 for bias, which is the precision used for // accumulation. @@ -247,8 +248,8 @@ TFL::QConstOp CreateTflConstOpForDummyBias( /*quantization_dimension=*/0); } else { const auto filter_quantized_element_type = - GetElementType(filter_const_op.getResult()) - .cast(); + llvm::cast( + GetElementType(filter_const_op.getResult())); // The storage type is i32 for bias, which is the precision used for // accumulation. @@ -297,8 +298,8 @@ Type GetQuantizedOutputType(Operation* op, PatternRewriter& rewriter, } // StableHLO Quantizer outputs an i32 type. Rewrite to i8 type result // to meet TFLite op requirement. - auto result_quantized_type = GetElementType(uniform_quantize_op->getResult(0)) - .cast(); + auto result_quantized_type = llvm::cast( + GetElementType(uniform_quantize_op->getResult(0))); auto new_result_quantized_type = CreateI8F32UniformQuantizedType( uniform_quantize_op->getLoc(), *rewriter.getContext(), result_quantized_type.getScale(), result_quantized_type.getZeroPoint()); @@ -306,8 +307,8 @@ Type GetQuantizedOutputType(Operation* op, PatternRewriter& rewriter, // fused `qi8` type. rewriter.replaceAllUsesWith(uniform_quantize_op->getResult(0), op->getResult(0)); - return op->getResult(0).getType().cast().clone( - new_result_quantized_type); + return llvm::cast(op->getResult(0).getType()) + .clone(new_result_quantized_type); } // Matches kernel dimension numbers, ranks of input and output and constant @@ -331,7 +332,7 @@ LogicalResult MatchConvolutionFormat(stablehlo::ConvolutionOp op) { return failure(); } - const auto input_type = op.getLhs().getType().cast(); + const auto input_type = llvm::cast(op.getLhs().getType()); if (input_type.getRank() != 4) { LLVM_DEBUG(llvm::dbgs() << "Only 2D convolution op is supported. " "Expected input rank of 4. Got: " @@ -339,7 +340,7 @@ LogicalResult MatchConvolutionFormat(stablehlo::ConvolutionOp op) { return failure(); } - const auto filter_type = op.getRhs().getType().cast(); + const auto filter_type = llvm::cast(op.getRhs().getType()); if (filter_type.getRank() != 4) { LLVM_DEBUG(llvm::dbgs() << "Only 2D convolution op is supported. " "Expected filter rank of 4. 
Got: " @@ -445,15 +446,16 @@ int64_t GetConvolutionKernelInputFeatureDimension(bool is_depthwise) { // TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML. class RewriteUniformQuantizeOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; // Determines whether the input and output types are compatible with // `tfl.quantize`. See the definition for the `QUANTIZE` kernel for the // detailed limitations // (https://github.com/tensorflow/tensorflow/blob/8f145d579aa0ee7f4187af32dbbf4e12fdabbffe/tensorflow/lite/kernels/quantize.cc#L105). - LogicalResult match(stablehlo::UniformQuantizeOp op) const override { + LogicalResult matchAndRewrite(stablehlo::UniformQuantizeOp op, + PatternRewriter& rewriter) const override { const Type input_element_type = GetElementType(op.getOperand()); - if (!(input_element_type.isa() || + if (!(llvm::isa(input_element_type) || IsI32F32UniformQuantizedType(input_element_type) || IsI32F32UniformQuantizedPerAxisType(input_element_type))) { LLVM_DEBUG(llvm::dbgs() << "Uniform quantize op's input should be a " @@ -464,42 +466,37 @@ class RewriteUniformQuantizeOp // Output type of `UniformQuantizeOp` is guaranteed to be a quantized // tensor with integer storage type. - const auto output_storage_type = GetElementType(op.getResult()) - .cast() - .getStorageType() - .cast(); + const auto output_storage_type = llvm::cast( + llvm::cast(GetElementType(op.getResult())) + .getStorageType()); if (!IsSupportedByTfliteQuantizeOrDequantizeOps(output_storage_type)) { LLVM_DEBUG(llvm::dbgs() << "Failed to match storage type of output quantized type.\n"); return failure(); } - return success(); - } - - void rewrite(stablehlo::UniformQuantizeOp op, - PatternRewriter& rewriter) const override { Type output_type = *op->getResultTypes().begin(); rewriter.replaceOpWithNewOp( op, output_type, /*input=*/op.getOperand(), /*qtype=*/TypeAttr::get(output_type)); + return success(); } }; // stablehlo.uniform_dequantize -> tfl.dequantize class RewriteUniformDequantizeOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; // Determines whether the input and output types are compatible with // `tfl.dequantize`. See the definition for the `DEQUANTIZE` kernel for the // detailed limitations // (https://github.com/tensorflow/tensorflow/blob/8f145d579aa0ee7f4187af32dbbf4e12fdabbffe/tensorflow/lite/kernels/dequantize.cc#L52). - LogicalResult match(stablehlo::UniformDequantizeOp op) const override { - const auto input_storage_type = GetElementType(op.getOperand()) - .cast() - .getStorageType() - .cast(); + LogicalResult matchAndRewrite(stablehlo::UniformDequantizeOp op, + PatternRewriter& rewriter) const override { + const auto input_storage_type = llvm::cast( + llvm::cast(GetElementType(op.getOperand())) + .getStorageType()); if (!IsSupportedByTfliteQuantizeOrDequantizeOps(input_storage_type)) { LLVM_DEBUG(llvm::dbgs() << "Failed to match storage type of input quantized type.\n"); @@ -508,21 +505,17 @@ class RewriteUniformDequantizeOp // Output type is guaranteed to be a float tensor for a valid StableHLO. const auto output_element_type = - GetElementType(op.getResult()).cast(); - if (!output_element_type.isa()) { + llvm::cast(GetElementType(op.getResult())); + if (!llvm::isa(output_element_type)) { LLVM_DEBUG(llvm::dbgs() << "Uniform dequantize op's output element type " "should be f32. 
Got: " << output_element_type << ".\n"); return failure(); } - return success(); - } - - void rewrite(stablehlo::UniformDequantizeOp op, - PatternRewriter& rewriter) const override { rewriter.replaceOpWithNewOp( op, /*resultTypes=*/op->getResultTypes(), /*input=*/op.getOperand()); + return success(); } }; @@ -570,7 +563,17 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp MLIRContext* ctx) : OpRewritePattern(ctx, /*benefit=*/10) {} - LogicalResult match(stablehlo::DotGeneralOp op) const override { + LogicalResult matchAndRewrite(stablehlo::DotGeneralOp op, + PatternRewriter& rewriter) const override { + if (match(op).failed()) { + return failure(); + } + rewrite(op, rewriter); + return success(); + } + + private: + LogicalResult match(stablehlo::DotGeneralOp op) const { const stablehlo::DotDimensionNumbersAttr dot_dimension_nums = op.getDotDimensionNumbers(); const bool is_batch_matmul = !IsDotGeneralFullyConnected(op).value(); @@ -602,8 +605,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp has_i32_output); } - void rewrite(stablehlo::DotGeneralOp op, - PatternRewriter& rewriter) const override { + void rewrite(stablehlo::DotGeneralOp op, PatternRewriter& rewriter) const { const Type output_type = GetElementType(op.getResult()); const bool has_i32_output = IsI32F32UniformQuantizedType(output_type) || @@ -621,7 +623,6 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp } } - private: static LogicalResult MatchDotGeneralToTflBatchMatmulOp( stablehlo::DotGeneralOp op, const stablehlo::DotDimensionNumbersAttr dot_dimension_nums, @@ -652,7 +653,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp "quantized dot_general.\n"); return failure(); } - const auto input_type = op.getLhs().getType().cast(); + const auto input_type = llvm::cast(op.getLhs().getType()); const int input_rank = input_type.getRank(); const auto input_contracting_dim = dot_dimension_nums.getLhsContractingDimensions()[0]; @@ -663,7 +664,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp return failure(); } - const auto filter_type = op.getRhs().getType().cast(); + const auto filter_type = llvm::cast(op.getRhs().getType()); const Type filter_element_type = filter_type.getElementType(); if (!IsI8F32UniformQuantizedType(filter_element_type)) { LLVM_DEBUG(llvm::dbgs() @@ -672,7 +673,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp << filter_type << "\n"); return failure(); } - const int rhs_rank = filter_type.cast().getRank(); + const int rhs_rank = llvm::cast(filter_type).getRank(); const auto rhs_contracting_dim = dot_dimension_nums.getRhsContractingDimensions()[0]; if ((rhs_contracting_dim != rhs_rank - 1) && @@ -699,7 +700,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp return failure(); } - const auto input_type = op.getLhs().getType().cast(); + const auto input_type = llvm::cast(op.getLhs().getType()); if (!(input_type.getRank() == 2 || input_type.getRank() == 3)) { LLVM_DEBUG(llvm::dbgs() << "Input expected to have rank of 2 or 3. Got: " << input_type << ".\n"); @@ -707,7 +708,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp } const Value filter = op.getRhs(); - const auto filter_type = filter.getType().cast(); + const auto filter_type = llvm::cast(filter.getType()); if (filter_type.getRank() != 2) { LLVM_DEBUG(llvm::dbgs() << "Filter tensor expected to have a tensor rank of 2. 
Got: " @@ -749,7 +750,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp } static LogicalResult MatchInputDotGeneralCommonPattern(const Value input) { - const auto input_type = input.getType().cast(); + const auto input_type = llvm::cast(input.getType()); if (const auto input_element_type = input_type.getElementType(); !IsI8F32UniformQuantizedType(input_element_type)) { LLVM_DEBUG(llvm::dbgs() @@ -766,7 +767,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp } static LogicalResult MatchFilterCommonPattern(const Value filter) { - auto filter_type = filter.getType().cast(); + auto filter_type = llvm::cast(filter.getType()); if (!filter_type.hasRank()) { LLVM_DEBUG(llvm::dbgs() << "Expected rhs of dot_general has rank. Got: " << filter.getType() << "\n"); @@ -827,11 +828,11 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp // dynamic-range quantized. const BoolAttr asymmetric_quantize_inputs = nullptr; - const int lhs_rank = lhs_value.getType().cast().getRank(); + const int lhs_rank = llvm::cast(lhs_value.getType()).getRank(); const BoolAttr adj_x = (lhs_contracting_dims[0] == lhs_rank - 2 ? rewriter.getBoolAttr(true) : rewriter.getBoolAttr(false)); - const int rhs_rank = rhs_value.getType().cast().getRank(); + const int rhs_rank = llvm::cast(rhs_value.getType()).getRank(); const BoolAttr adj_y = (rhs_contracting_dims[0] == rhs_rank - 1 ? rewriter.getBoolAttr(true) : rewriter.getBoolAttr(false)); @@ -852,7 +853,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp // Update BMM if rhs is a constant. if (filter_constant_op != nullptr) { const auto rhs_uniform_quantized_type = - rhs_value.getType().cast(); + llvm::cast(rhs_value.getType()); const auto rhs_constant_value_attr = cast(filter_constant_op.getValue()); auto rhs_constant_op = rewriter.create( @@ -883,7 +884,8 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp rhs_value.getDefiningOp(), rewriter, /*is_per_channel=*/true); const double input_scale = - GetElementType(lhs_value).cast().getScale(); + llvm::cast(GetElementType(lhs_value)) + .getScale(); TFL::QConstOp bias_tfl_op; bool fuse_bias_constant = FindUserOfType(op) && has_i32_output; @@ -919,23 +921,23 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp Operation* add_op = FindUserOfType(op); uniform_quantize_op = FindUserOfType(add_op); const auto filter_quantized_type = - GetElementType(op->getOperand(1)) - .cast(); + llvm::cast( + GetElementType(op->getOperand(1))); const SmallVector bias_scales = GetBiasScales( - /*input_scale=*/GetElementType(op->getOperand(0)) - .cast() + /*input_scale=*/llvm::cast( + GetElementType(op->getOperand(0))) .getScale(), /*filter_scales=*/filter_quantized_type.getScales()); const ArrayRef output_shape = - op->getResult(0).getType().cast().getShape(); + llvm::cast(op->getResult(0).getType()).getShape(); const SmallVector bias_shape = { output_shape[output_shape.size() - 1]}; // `tfl.fully_connected`'s `GetChannelDimIndex` is 0. 
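On the bias handling just above: when the trailing add is fused into the quantized op, each per-channel bias scale is conventionally the product of the input scale and the corresponding filter scale, and the bias storage type is i32 because that is the accumulator precision. GetBiasScales in this file computes this; the stand-alone sketch below only illustrates the convention and is not the file's helper.

#include <cstdint>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

namespace example {

// bias_scale[i] = input_scale * filter_scale[i], so the i32 bias can be added
// directly to the i32 accumulator of sum(input * filter) without rescaling.
llvm::SmallVector<double> DeriveBiasScales(
    double input_scale, llvm::ArrayRef<double> filter_scales) {
  llvm::SmallVector<double> bias_scales;
  bias_scales.reserve(filter_scales.size());
  for (const double filter_scale : filter_scales) {
    bias_scales.push_back(input_scale * filter_scale);
  }
  return bias_scales;
}

}  // namespace example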
const auto bias_quantized_type = CreateI32F32UniformQuantizedPerAxisType( op->getLoc(), *op->getContext(), std::move(bias_scales), - GetElementType(op->getResult(0)) - .cast() + llvm::cast( + GetElementType(op->getResult(0))) .getZeroPoints(), /*quantization_dimension=*/0); Operation* bias_const_op = GetBiasConstOp(add_op); @@ -954,14 +956,14 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp } const auto result_quantized_type = - GetElementType(uniform_quantize_op->getResult(0)) - .cast(); + llvm::cast( + GetElementType(uniform_quantize_op->getResult(0))); const auto new_result_quantized_type = CreateI8F32UniformQuantizedType( uniform_quantize_op->getLoc(), *rewriter.getContext(), result_quantized_type.getScale(), result_quantized_type.getZeroPoint()); - output_type = op->getResult(0).getType().cast().clone( - new_result_quantized_type); + output_type = llvm::cast(op->getResult(0).getType()) + .clone(new_result_quantized_type); // Omit any bias and requantize ops as `tfl.fully_connected` outputs a // fused `qi8` type. FindUserOfType<>(uniform_quantize_op)->setOperand(0, op->getResult(0)); @@ -1006,8 +1008,19 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp class RewriteQuantizedConvolutionOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; - LogicalResult match(stablehlo::ConvolutionOp op) const override { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(stablehlo::ConvolutionOp op, + PatternRewriter& rewriter) const override { + if (match(op).failed()) { + return failure(); + } + rewrite(op, rewriter); + return success(); + } + + private: + LogicalResult match(stablehlo::ConvolutionOp op) const { const bool has_i32_output = IsI32F32UniformQuantizedPerAxisType(GetElementType(op.getResult())); const bool fuse_bias_constant = @@ -1053,8 +1066,7 @@ class RewriteQuantizedConvolutionOp return success(); } - void rewrite(stablehlo::ConvolutionOp op, - PatternRewriter& rewriter) const override { + void rewrite(stablehlo::ConvolutionOp op, PatternRewriter& rewriter) const { const bool has_i32_output = IsI32F32UniformQuantizedPerAxisType(GetElementType(op.getResult())); stablehlo::ConvDimensionNumbersAttr dimension_numbers = @@ -1145,9 +1157,8 @@ class RewriteQuantizedConvolutionOp } } - private: static LogicalResult MatchInput(Value input) { - auto input_type = input.getType().cast(); + auto input_type = llvm::cast(input.getType()); if (const auto input_element_type = input_type.getElementType(); !IsI8F32UniformQuantizedType(input_element_type)) { LLVM_DEBUG(llvm::dbgs() @@ -1160,7 +1171,7 @@ class RewriteQuantizedConvolutionOp } static LogicalResult MatchFilter(Value filter) { - auto filter_type = filter.getType().cast(); + auto filter_type = llvm::cast(filter.getType()); const Type filter_element_type = filter_type.getElementType(); if (!IsI8F32UniformQuantizedPerAxisType(filter_type.getElementType())) { LLVM_DEBUG( @@ -1170,7 +1181,7 @@ class RewriteQuantizedConvolutionOp return failure(); } - if (filter_element_type.cast() + if (llvm::cast(filter_element_type) .getQuantizedDimension() != 3) { LLVM_DEBUG(llvm::dbgs() << "Quantized dimension should be 3. 
Got: " << filter_element_type << "\n"); @@ -1217,7 +1228,7 @@ class RewriteQuantizedConvolutionOp tfl_pad_values.push_back(0); const auto input_tensor_type = - input_value.getType().cast(); + llvm::cast(input_value.getType()); const int64_t rank = input_tensor_type.getRank(); SmallVector padded_output_tensor_shape = @@ -1353,12 +1364,12 @@ class RewriteQuantizedConvolutionOp std::tuple GetInOutDimensions( stablehlo::ConvolutionOp op, stablehlo::ConvDimensionNumbersAttr dimension_numbers) const { - const auto [input_height, input_width] = - GetDimSize(op->getOperand(0).getType().cast().getShape(), - dimension_numbers.getInputSpatialDimensions()); - const auto [output_height, output_width] = - GetDimSize(op->getResult(0).getType().cast().getShape(), - dimension_numbers.getOutputSpatialDimensions()); + const auto [input_height, input_width] = GetDimSize( + llvm::cast(op->getOperand(0).getType()).getShape(), + dimension_numbers.getInputSpatialDimensions()); + const auto [output_height, output_width] = GetDimSize( + llvm::cast(op->getResult(0).getType()).getShape(), + dimension_numbers.getOutputSpatialDimensions()); return {input_height, input_width, output_height, output_width}; } @@ -1397,7 +1408,8 @@ class RewriteQuantizedConvolutionOp Value filter_value = op.getOperand(1); Operation* filter_op = filter_value.getDefiningOp(); auto filter_uniform_quantized_type = - GetElementType(filter_value).cast(); + llvm::cast( + GetElementType(filter_value)); auto filter_constant_value_attr = cast( cast(filter_value.getDefiningOp()).getValue()); const DenseIntElementsAttr new_filter_value_attr = @@ -1440,8 +1452,8 @@ class RewriteQuantizedConvolutionOp const SmallVector bias_shape, const bool has_i32_output, const bool fuse_bias_constant) const { const SmallVector bias_scales = GetBiasScales( - /*input_scale=*/GetElementType(op.getOperand(0)) - .cast() + /*input_scale=*/llvm::cast( + GetElementType(op.getOperand(0))) .getScale(), /*filter_scales=*/new_filter_quantized_type.getScales()); @@ -1480,15 +1492,14 @@ class RewriteQuantizedConvolutionOp class RewriteQuantizedTransposeOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult match(stablehlo::TransposeOp op) const override { - return success(IsOpFullyQuantized(op)); - } + using OpRewritePattern::OpRewritePattern; - void rewrite(stablehlo::TransposeOp op, - PatternRewriter& rewriter) const override { - auto operand_type = op.getOperand().getType().cast(); + LogicalResult matchAndRewrite(stablehlo::TransposeOp op, + PatternRewriter& rewriter) const override { + if (!IsOpFullyQuantized(op)) { + return failure(); + } + auto operand_type = llvm::cast(op.getOperand().getType()); const int64_t rank = operand_type.getRank(); ArrayRef shape(rank); TensorType permutation_type = @@ -1503,6 +1514,7 @@ class RewriteQuantizedTransposeOp rewriter.create(op.getLoc(), permutation_attr); rewriter.replaceOpWithNewOp(op, op.getOperand(), permutation); + return success(); } }; @@ -1510,35 +1522,35 @@ class RewriteQuantizedTransposeOp class RewriteQuantizedReshapeOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(stablehlo::ReshapeOp op) const override { - return success(IsOpFullyQuantized(op)); - } - - void rewrite(stablehlo::ReshapeOp op, - PatternRewriter& rewriter) const override { + LogicalResult matchAndRewrite(stablehlo::ReshapeOp op, + PatternRewriter& rewriter) const override { + if (!IsOpFullyQuantized(op)) { + return 
failure(); + } rewriter.replaceOpWithNewOp( op, op.getOperand(), CreateI32ShapeConstantOp(op.getResult().getType(), op->getLoc(), rewriter)); + return success(); } }; class RewriteQuantizedDynamicReshapeOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult match(stablehlo::DynamicReshapeOp op) const override { - return success(IsQuantizedTensorType(op.getOperand().getType()) && - IsQuantizedTensorType(op.getResult().getType())); - } + using OpRewritePattern::OpRewritePattern; - void rewrite(stablehlo::DynamicReshapeOp op, - PatternRewriter& rewriter) const override { + LogicalResult matchAndRewrite(stablehlo::DynamicReshapeOp op, + PatternRewriter& rewriter) const override { + if (!IsQuantizedTensorType(op.getOperand().getType()) || + !IsQuantizedTensorType(op.getResult().getType())) { + return failure(); + } rewriter.replaceOpWithNewOp(op, op.getOperand(), op.getOutputShape()); + return success(); } }; @@ -1546,9 +1558,10 @@ class RewriteQuantizedDynamicReshapeOp // TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML. class RewriteQuantizedSelectOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(stablehlo::SelectOp op) const override { + LogicalResult matchAndRewrite(stablehlo::SelectOp op, + PatternRewriter& rewriter) const override { if (!IsQuantizedTensorType(op.getOperand(1).getType())) { return failure(); } @@ -1558,15 +1571,11 @@ class RewriteQuantizedSelectOp : public OpRewritePattern { if (!IsQuantizedTensorType(op.getResult().getType())) { return failure(); } - return success(); - } - - void rewrite(stablehlo::SelectOp op, - PatternRewriter& rewriter) const override { Value pred = op.getOperand(0); Value on_true = op.getOperand(1); Value on_false = op.getOperand(2); rewriter.replaceOpWithNewOp(op, pred, on_true, on_false); + return success(); } }; @@ -1575,19 +1584,19 @@ class RewriteQuantizedSelectOp : public OpRewritePattern { class RewriteQuantizedConcatenateOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(stablehlo::ConcatenateOp op) const override { - return success(IsOpFullyQuantized(op)); - } - - void rewrite(stablehlo::ConcatenateOp op, - PatternRewriter& rewriter) const override { + LogicalResult matchAndRewrite(stablehlo::ConcatenateOp op, + PatternRewriter& rewriter) const override { + if (!IsOpFullyQuantized(op)) { + return failure(); + } Type output_type = op.getResult().getType(); uint32_t axis = CastI64ToI32(op.getDimension()).value(); rewriter.replaceOpWithNewOp( op, output_type, op.getOperands(), axis, /*fused_activation_function=*/rewriter.getStringAttr("NONE")); + return success(); } }; @@ -1596,13 +1605,13 @@ class RewriteQuantizedConcatenateOp // TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML. 
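A general note on the pattern rewrites in this file: the separate match() and rewrite() overrides are folded into a single matchAndRewrite(), matching the current OpRewritePattern interface. Small patterns inline their checks and return failure() early, while the larger DotGeneral, Convolution, and ReduceWindow patterns keep match()/rewrite() as private helpers behind a thin dispatcher. The sketch below is illustrative only; the pattern name, the dead-op check, and the trivial rewrite are not code from this file.

#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/LLVM.h"
#include "stablehlo/dialect/StablehloOps.h"

namespace example {

class EraseDeadTransposePattern
    : public mlir::OpRewritePattern<mlir::stablehlo::TransposeOp> {
 public:
  using mlir::OpRewritePattern<mlir::stablehlo::TransposeOp>::OpRewritePattern;

  mlir::LogicalResult matchAndRewrite(
      mlir::stablehlo::TransposeOp op,
      mlir::PatternRewriter& rewriter) const override {
    if (match(op).failed()) return mlir::failure();
    rewrite(op, rewriter);
    return mlir::success();
  }

 private:
  // Previously `LogicalResult match(...) const override`.
  mlir::LogicalResult match(mlir::stablehlo::TransposeOp op) const {
    return mlir::success(op->use_empty());
  }

  // Previously `void rewrite(...) const override`.
  void rewrite(mlir::stablehlo::TransposeOp op,
               mlir::PatternRewriter& rewriter) const {
    rewriter.eraseOp(op);
  }
};

}  // namespace example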
class RewriteQuantizedPadOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(stablehlo::PadOp op) const override { - return success(IsOpFullyQuantized(op)); - } - - void rewrite(stablehlo::PadOp op, PatternRewriter& rewriter) const override { + LogicalResult matchAndRewrite(stablehlo::PadOp op, + PatternRewriter& rewriter) const override { + if (!IsOpFullyQuantized(op)) { + return failure(); + } Value input = op.getOperand(); // If any of the interior padding is non-zero, operand should be dilated // first, and then padded. @@ -1611,7 +1620,7 @@ class RewriteQuantizedPadOp : public OpRewritePattern { input = InsertDilateOp(op, rewriter); } - TensorType operand_type = input.getType().cast(); + TensorType operand_type = llvm::cast(input.getType()); const int64_t rank = operand_type.getRank(); // Shape of padding should be [rank, 2]. SmallVector shape{rank, 2}; @@ -1626,18 +1635,19 @@ class RewriteQuantizedPadOp : public OpRewritePattern { padding_value.push_back(CastI64ToI32(padding_high[i]).value()); } - TensorType output_type = op.getResult().getType().cast(); + TensorType output_type = llvm::cast(op.getResult().getType()); Value constant_values = op.getPaddingValue(); auto padding_attr = DenseIntElementsAttr::get(padding_type, padding_value); auto padding = rewriter.create(op.getLoc(), padding_attr); rewriter.replaceOpWithNewOp(op, output_type, input, padding, constant_values); + return success(); } Value InsertDilateOp(stablehlo::PadOp op, PatternRewriter& rewriter) const { Value input = op.getOperand(); - TensorType operand_type = input.getType().cast(); + TensorType operand_type = llvm::cast(input.getType()); const int64_t rank = operand_type.getRank(); ArrayRef dilate_shape(rank); @@ -1657,7 +1667,7 @@ class RewriteQuantizedPadOp : public OpRewritePattern { dilated_shape[i] = operand_shape[i] + interior_padding_i64[i] * (operand_shape[i] - 1); } - TensorType output_type = op.getResult().getType().cast(); + TensorType output_type = llvm::cast(op.getResult().getType()); Type dilated_output_type = output_type.clone(dilated_shape); Value constant_values = op.getPaddingValue(); @@ -1669,15 +1679,14 @@ class RewriteQuantizedPadOp : public OpRewritePattern { // Rewrites quantized stablehlo.slice to tfl.slice or tfl.strided_slice. 
class RewriteQuantizedSliceOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult match(stablehlo::SliceOp op) const override { - return success(IsOpFullyQuantized(op)); - } + using OpRewritePattern::OpRewritePattern; - void rewrite(stablehlo::SliceOp op, - PatternRewriter& rewriter) const override { - auto operand_type = op.getOperand().getType().cast(); + LogicalResult matchAndRewrite(stablehlo::SliceOp op, + PatternRewriter& rewriter) const override { + if (!IsOpFullyQuantized(op)) { + return failure(); + } + auto operand_type = llvm::cast(op.getOperand().getType()); Type output_type = op.getResult().getType(); const int64_t rank = operand_type.getRank(); @@ -1709,7 +1718,7 @@ class RewriteQuantizedSliceOp : public OpRewritePattern { if (llvm::all_of(strides, [](int64_t stride) { return stride == 1; })) { rewriter.replaceOpWithNewOp( op, output_type, op.getOperand(), start_idx, slice_size); - return; + return success(); } SmallVector stride_i32 = CastI64ArrayToI32(strides).value(); @@ -1720,6 +1729,7 @@ class RewriteQuantizedSliceOp : public OpRewritePattern { /*begin_mask=*/0, /*end_mask=*/0, /*ellipsis_mask=*/0, /*new_axis_mask=*/0, /*shrink_axis_mask=*/0, /*offset=*/false); + return success(); } }; @@ -1731,16 +1741,15 @@ class RewriteQuantizedSliceOp : public OpRewritePattern { class RewriteQuantizedBroadcastInDimOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(stablehlo::BroadcastInDimOp op) const override { - return success(IsOpFullyQuantized(op)); - } - - void rewrite(stablehlo::BroadcastInDimOp op, - PatternRewriter& rewriter) const override { - auto operand_type = op.getOperand().getType().cast(); - auto output_type = op.getResult().getType().cast(); + LogicalResult matchAndRewrite(stablehlo::BroadcastInDimOp op, + PatternRewriter& rewriter) const override { + if (!IsOpFullyQuantized(op)) { + return failure(); + } + auto operand_type = llvm::cast(op.getOperand().getType()); + auto output_type = llvm::cast(op.getResult().getType()); Value input = op.getOperand(); // If broadcast_dimensions is not in ascending order, transpose first. 
@@ -1765,6 +1774,7 @@ class RewriteQuantizedBroadcastInDimOp rewriter.replaceOpWithNewOp(op, output_type, input, shape); + return success(); } Value InsertTransposeOp(stablehlo::BroadcastInDimOp op, @@ -1778,7 +1788,7 @@ class RewriteQuantizedBroadcastInDimOp return static_cast(llvm::find(sorted_dims, dim) - sorted_dims.begin()); })); - auto operand_type = op.getOperand().getType().cast(); + auto operand_type = llvm::cast(op.getOperand().getType()); TensorType perm_type = operand_type.cloneWith( {static_cast(permutation.size())}, rewriter.getI32Type()); auto perm_attr = DenseIntElementsAttr::get(perm_type, permutation); @@ -1791,7 +1801,7 @@ class RewriteQuantizedBroadcastInDimOp Value InsertExpandDimsOp(stablehlo::BroadcastInDimOp op, PatternRewriter& rewriter, Value input, int64_t output_rank) const { - auto input_type = input.getType().cast(); + auto input_type = llvm::cast(input.getType()); SmallVector input_shape(input_type.getShape()); SmallVector input_dims = llvm::to_vector(op.getBroadcastDimensions()); @@ -1828,8 +1838,18 @@ class RewriteQuantizedBroadcastInDimOp class RewriteQuantizedReduceWindowOpWithMax : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(stablehlo::ReduceWindowOp op, + PatternRewriter& rewriter) const override { + if (match(op).failed()) { + return failure(); + } + rewrite(op, rewriter); + return success(); + } + + private: LogicalResult MatchBinaryReduceFunction(Region& function) const { Block& body = function.front(); if (body.getNumArguments() != 2) return failure(); @@ -1845,7 +1865,7 @@ class RewriteQuantizedReduceWindowOpWithMax reduce_op.getRhs() == body.getArgument(1)); } - LogicalResult match(stablehlo::ReduceWindowOp op) const override { + LogicalResult match(stablehlo::ReduceWindowOp op) const { // Check that the reduce-window is a max-reduce-window. if (failed(MatchBinaryReduceFunction(op.getBody()))) { return failure(); @@ -1879,8 +1899,7 @@ class RewriteQuantizedReduceWindowOpWithMax return success(IsOpFullyQuantized(op)); } - void rewrite(stablehlo::ReduceWindowOp op, - PatternRewriter& rewriter) const override { + void rewrite(stablehlo::ReduceWindowOp op, PatternRewriter& rewriter) const { Type result_type = op.getResult(0).getType(); Value input = op.getOperand(0); // Ops with padding is rejected in matching function, so we can use the @@ -1923,9 +1942,10 @@ class RewriteQuantizedReduceWindowOpWithMax // offset dimensions. class RewriteQuantizedGatherOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(stablehlo::GatherOp op) const override { + LogicalResult matchAndRewrite(stablehlo::GatherOp op, + PatternRewriter& rewriter) const override { const Type input_type = op.getOperand().getType(); const Type output_type = op.getResult().getType(); if (!IsQuantizedTensorType(input_type) || @@ -1933,7 +1953,7 @@ class RewriteQuantizedGatherOp : public OpRewritePattern { return failure(); } - auto output_tensor_type = output_type.cast(); + auto output_tensor_type = llvm::cast(output_type); if (!output_tensor_type.hasRank()) { return failure(); } @@ -1989,7 +2009,7 @@ class RewriteQuantizedGatherOp : public OpRewritePattern { // Input type is checked to be quantized tensor type. 
const auto input_shape = - op.getOperand().getType().cast().getShape(); + llvm::cast(op.getOperand().getType()).getShape(); SmallVector input_offset_shape; for (int64_t i = 0; i < input_shape.size(); ++i) { if (!llvm::is_contained(start_index_map, i)) { @@ -2005,14 +2025,10 @@ class RewriteQuantizedGatherOp : public OpRewritePattern { } } - return success(); - } - - void rewrite(stablehlo::GatherOp op, - PatternRewriter& rewriter) const override { rewriter.replaceOpWithNewOp( op, /*output=*/op.getResult().getType(), /*params=*/op.getOperand(), /*indices=*/op.getStartIndices()); + return success(); } }; @@ -2021,22 +2037,19 @@ class RewriteQuantizedGatherOp : public OpRewritePattern { class RewriteQuantizedDynamicSliceOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(stablehlo::DynamicSliceOp op) const override { + LogicalResult matchAndRewrite(stablehlo::DynamicSliceOp op, + PatternRewriter& rewriter) const override { if (!IsQuantizedTensorType(op.getOperand().getType()) || - !IsQuantizedTensorType(op.getResult().getType())) { + !IsQuantizedTensorType(op.getResult().getType()) || + !quant::HasStaticShape(op.getOperand())) { return failure(); } - return success(quant::HasStaticShape(op.getOperand())); - } - - void rewrite(stablehlo::DynamicSliceOp op, - PatternRewriter& rewriter) const override { Type output = op.getResult().getType(); Value input = op.getOperand(); - TensorType operand_type = input.getType().cast(); + TensorType operand_type = llvm::cast(input.getType()); ArrayRef operand_shape = operand_type.getShape(); const int64_t rank = operand_type.getRank(); const Type i64_type = rewriter.getI64Type(); @@ -2089,19 +2102,20 @@ class RewriteQuantizedDynamicSliceOp auto size = rewriter.create(op.getLoc(), size_attr); rewriter.replaceOpWithNewOp(op, output, input, begin, size); + return success(); } }; class RewriteQuantizedAddOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(stablehlo::AddOp op) const override { - return success(IsI8F32UniformQuantizedType(GetElementType(op.getLhs())) && - IsI8F32UniformQuantizedType(GetElementType(op.getRhs()))); - } - - void rewrite(stablehlo::AddOp op, PatternRewriter& rewriter) const override { + LogicalResult matchAndRewrite(stablehlo::AddOp op, + PatternRewriter& rewriter) const override { + if (!IsI8F32UniformQuantizedType(GetElementType(op.getLhs())) || + !IsI8F32UniformQuantizedType(GetElementType(op.getRhs()))) { + return failure(); + } TFL::QConstOp lhs_qconst_op; TFL::QConstOp rhs_qconst_op; @@ -2111,7 +2125,7 @@ class RewriteQuantizedAddOp : public OpRewritePattern { auto stablehlo_const_op = dyn_cast_or_null( broadcast_op.getOperand().getDefiningOp()); auto const_uniform_quantized_type = - stablehlo_const_op.getResult().getType().cast(); + llvm::cast(stablehlo_const_op.getResult().getType()); return rewriter.create( op.getLoc(), TypeAttr::get(const_uniform_quantized_type), cast(stablehlo_const_op.getValue())); @@ -2127,6 +2141,7 @@ class RewriteQuantizedAddOp : public OpRewritePattern { lhs_qconst_op ? lhs_qconst_op : op.getOperand(0), rhs_qconst_op ? 
rhs_qconst_op : op.getOperand(1), /*fused_activation_function=*/rewriter.getStringAttr("NONE")); + return success(); } }; @@ -2134,17 +2149,17 @@ class RewriteQuantizedAddOp : public OpRewritePattern { class RewriteQuantizedConstantOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult match(stablehlo::ConstantOp op) const override { - return success(IsQuantizedTensorType(op.getOutput().getType())); - } + using OpRewritePattern::OpRewritePattern; - void rewrite(stablehlo::ConstantOp op, - PatternRewriter& rewriter) const override { + LogicalResult matchAndRewrite(stablehlo::ConstantOp op, + PatternRewriter& rewriter) const override { + if (!IsQuantizedTensorType(op.getOutput().getType())) { + return failure(); + } rewriter.replaceOpWithNewOp( op, /*qtype=*/TypeAttr::get(op.getOutput().getType()), /*value=*/op.getValue()); + return success(); } }; @@ -2155,26 +2170,26 @@ class RewriteQuantizedConstantOp class RewriteHybridQuantizedDotGeneralOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(stablehlo::DotGeneralOp op) const override { + LogicalResult matchAndRewrite(stablehlo::DotGeneralOp op, + PatternRewriter& rewriter) const override { // Lhs and result should not be quantized and rhs should be quantized. - return success(!IsQuantizedTensorType(op->getOperand(0).getType()) && - IsQuantizedTensorType(op->getOperand(1).getType()) && - !IsQuantizedTensorType(op->getResult(0).getType())); - } - - void rewrite(stablehlo::DotGeneralOp op, - PatternRewriter& rewriter) const override { + if (IsQuantizedTensorType(op->getOperand(0).getType()) || + !IsQuantizedTensorType(op->getOperand(1).getType()) || + IsQuantizedTensorType(op->getResult(0).getType())) { + return failure(); + } Value rhs = op.getRhs(); Type lhs_element_type = - op.getLhs().getType().template cast().getElementType(); + llvm::cast(op.getLhs().getType()).getElementType(); Type dequantized_rhs_type = quant::CloneTypeWithNewElementType(rhs.getType(), lhs_element_type); auto dq = rewriter.create( op->getLoc(), /*output=*/dequantized_rhs_type, /*input=*/rhs); rewriter.replaceAllUsesExcept(rhs, dq.getOutput(), dq); + return success(); } }; @@ -2189,20 +2204,19 @@ class RewriteHybridQuantizedConvolutionOp explicit RewriteHybridQuantizedConvolutionOp(MLIRContext* ctx) : OpRewritePattern(ctx, /*benefit=*/5) {} - LogicalResult match(stablehlo::ConvolutionOp op) const override { + LogicalResult matchAndRewrite(stablehlo::ConvolutionOp op, + PatternRewriter& rewriter) const override { if (failed(MatchConvolutionFormat(op))) { LLVM_DEBUG(llvm::dbgs() << "Failed to match dimension format for convolution_op.\n"); return failure(); } // Lhs and result should not be quantized and rhs should be quantized. 
- return success(!IsQuantizedTensorType(op->getOperand(0).getType()) && - IsQuantizedTensorType(op->getOperand(1).getType()) && - !IsQuantizedTensorType(op->getResult(0).getType())); - } - - void rewrite(stablehlo::ConvolutionOp op, - PatternRewriter& rewriter) const override { + if (IsQuantizedTensorType(op->getOperand(0).getType()) || + !IsQuantizedTensorType(op->getOperand(1).getType()) || + IsQuantizedTensorType(op->getResult(0).getType())) { + return failure(); + } const bool is_depthwise = IsDepthwiseConvolution(op); Operation* filter_op = op.getRhs().getDefiningOp(); @@ -2225,13 +2239,14 @@ class RewriteHybridQuantizedConvolutionOp op.setDimensionNumbersAttr(new_dimension_numbers); Type lhs_element_type = - op.getOperand(0).getType().template cast().getElementType(); + llvm::cast(op.getOperand(0).getType()).getElementType(); Type dequantized_rhs_type = quant::CloneTypeWithNewElementType( new_filter.getType(), lhs_element_type); auto dq = rewriter.create( op->getLoc(), /*output=*/dequantized_rhs_type, /*input=*/new_filter); rewriter.replaceAllUsesExcept(filter_op->getResult(0), dq.getOutput(), dq); + return success(); } private: @@ -2239,11 +2254,12 @@ class RewriteHybridQuantizedConvolutionOp Type GetNewWeightQuantizedType(MLIRContext* context, Location location, ArrayRef new_shape, Type filter_type, bool is_depthwise) const { - auto tensor_type = filter_type.cast(); + auto tensor_type = llvm::cast(filter_type); auto element_type = tensor_type.getElementType(); RankedTensorType new_filter_result_type; - if (element_type.isa()) { - auto per_axis_type = element_type.cast(); + if (llvm::isa(element_type)) { + auto per_axis_type = + llvm::cast(element_type); int64_t kernel_output_feature_dim = GetConvolutionKernelOutputFeatureDimension(is_depthwise); auto new_filter_quantized_type = CreateI8F32UniformQuantizedPerAxisType( @@ -2255,8 +2271,9 @@ class RewriteHybridQuantizedConvolutionOp RankedTensorType::getChecked(location, /*shape=*/new_shape, /*type=*/new_filter_quantized_type); - } else if (element_type.isa()) { - auto per_tensor_type = element_type.cast(); + } else if (llvm::isa(element_type)) { + auto per_tensor_type = + llvm::cast(element_type); new_filter_result_type = RankedTensorType::getChecked(location, /*shape=*/new_shape, diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/utils.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/utils.cc deleted file mode 100644 index b120a6f02e14..000000000000 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/utils.cc +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/utils.h" - -#include - -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" -#include "xla/mlir_hlo/utils/hlo_utils.h" - -namespace mlir { -namespace odml { - -mhlo::ConstantOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value, - OpBuilder* builder) { - return builder->create(loc, - hlo::getScalarOfType(ty, raw_value)); -} - -mhlo::ConstantOp GetScalarNegZeroOfType(Type ty, Location loc, - OpBuilder* builder) { - return builder->create(loc, - hlo::getScalarNegZeroOfType(ty)); -} - -DenseIntElementsAttr GetI64ElementsAttr(ArrayAttr attr) { - RankedTensorType ty = - RankedTensorType::get(static_cast(attr.size()), - IntegerType::get(attr.getContext(), 64)); - return DenseIntElementsAttr::get(ty, attr.getValue()); -} - -DenseIntElementsAttr GetI64ElementsAttr(ArrayRef values, - Builder* builder) { - RankedTensorType ty = RankedTensorType::get( - {static_cast(values.size())}, builder->getIntegerType(64)); - return DenseIntElementsAttr::get(ty, values); -} - -} // namespace odml -} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/utils.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/utils.h deleted file mode 100644 index fc7c2316655d..000000000000 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/utils.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_UTILS_H_ -#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_UTILS_H_ - -#include - -#include "llvm/ADT/ArrayRef.h" -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/BuiltinTypes.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" - -namespace mlir { -namespace odml { - -// Builds body for reduce op by using the template binary op as the -// reducer op. -template -void BuildReduceBody(Type element_type, Region* body, OpBuilder* builder) { - OpBuilder::InsertionGuard guard(*builder); - Block* block = builder->createBlock(body); - - // Block arguments are scalars of the given element type. 
- Type type = RankedTensorType::get(/*shape=*/{}, element_type); - Location loc = body->getLoc(); - block->addArguments({type, type}, SmallVector(2, loc)); - - auto reducer = - builder->create(loc, block->getArgument(0), block->getArgument(1)); - builder->create(loc, reducer.getResult()); -} - -mhlo::ConstantOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value, - OpBuilder* builder); - -mhlo::ConstantOp GetScalarNegZeroOfType(Type ty, Location loc, - OpBuilder* builder); - -// Converts an ArrayAttr to a 1D 64-bit dense elements attribute. -DenseIntElementsAttr GetI64ElementsAttr(ArrayAttr attr); -DenseIntElementsAttr GetI64ElementsAttr(llvm::ArrayRef values, - Builder* builder); - -} // namespace odml -} // namespace mlir - -#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/utils_test.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/utils_test.cc deleted file mode 100644 index 40d3cc271644..000000000000 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/utils_test.cc +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/utils.h" - -#include - -#include -#include -#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Support/LLVM.h" // from @llvm-project -#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" - -namespace mlir { -namespace odml { -namespace { - -TEST(UtilsTest, GetScalarConstOfType) { - MLIRContext context; - context.loadDialect(); - OpBuilder builder(&context); - Location loc = UnknownLoc::get(&context); - Type ty = builder.getI32Type(); - mhlo::ConstantOp op = GetScalarConstOfType(ty, loc, 123, &builder); - EXPECT_EQ(op.getValue().getValues()[0], 123); - - op->destroy(); -} - -TEST(UtilsTest, GetScalarNegZeroOfType) { - MLIRContext context; - context.loadDialect(); - OpBuilder builder(&context); - Location loc = UnknownLoc::get(&context); - Type ty = builder.getF32Type(); - mhlo::ConstantOp op = GetScalarNegZeroOfType(ty, loc, &builder); - EXPECT_EQ(op.getValue().getValues()[0], -0.f); - - op->destroy(); -} - -TEST(UtilsTest, GetI64ElementsAttr) { - MLIRContext context; - context.loadDialect(); - OpBuilder builder(&context); - Location loc = UnknownLoc::get(&context); - SmallVector values = {1, 2, 3}; - auto valuesAttr = builder.getI64ArrayAttr(values); - DenseIntElementsAttr attr = GetI64ElementsAttr(valuesAttr); - EXPECT_THAT(SmallVector(attr.getValues()), - testing::ElementsAreArray(values)); -} - -TEST(UtilsTest, GetI64ElementsAttrBuilder) { - MLIRContext context; - context.loadDialect(); - OpBuilder builder(&context); - Location loc = UnknownLoc::get(&context); - SmallVector values = {1, 
2, 3}; - DenseIntElementsAttr attr = GetI64ElementsAttr(values, &builder); - EXPECT_THAT(SmallVector(attr.getValues()), - testing::ElementsAreArray(values)); -} - -} // namespace - -} // namespace odml -} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/symlink_files.bzl b/tensorflow/compiler/mlir/lite/symlink_files.bzl new file mode 100644 index 000000000000..e757f32fa03e --- /dev/null +++ b/tensorflow/compiler/mlir/lite/symlink_files.bzl @@ -0,0 +1,117 @@ +# Copyright 2025 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Macros for symlinking files into certain directories at build time.""" + +def _symlink_files_impl(ctx): + flatten = ctx.attr.flatten + strip_prefix = ctx.attr.strip_prefix + mapping = ctx.attr.mapping + outputs = [] + for src in ctx.files.srcs: + src_path = src.short_path + if src_path in mapping: + file_dst = mapping[src_path] + else: + file_dst = src.basename if flatten else src_path + if not file_dst.startswith(strip_prefix): + fail(("File {} has destination {} that does not begin with" + + " strip_prefix {}").format( + src, + file_dst, + strip_prefix, + )) + file_dst = file_dst[len(strip_prefix):] + outfile = ctx.attr.dst + "/" + file_dst + out = ctx.actions.declare_file(outfile) + outputs.append(out) + ctx.actions.symlink(output = out, target_file = src) + outputs = depset(outputs) + return [DefaultInfo( + files = outputs, + runfiles = ctx.runfiles(transitive_files = outputs), + )] + +symlink_files = rule( + implementation = _symlink_files_impl, + attrs = { + "dst": attr.string( + default = ".", + doc = "Destination directory into which to symlink `srcs`." + + " Relative to current directory.", + ), + "srcs": attr.label_list( + allow_files = True, + doc = "Files to symlink into `dst`.", + ), + "flatten": attr.bool( + default = False, + doc = "Whether files in `srcs` should all be flattened to be" + + " direct children of `dst` or preserve their existing" + + " directory structure.", + ), + "strip_prefix": attr.string( + default = "", + doc = "Literal string prefix to strip from the paths of all files" + + " in `srcs`. All files in `srcs` must begin with this" + + " prefix or be present mapping. Generally they would not be" + + " used together, but prefix stripping happens after flattening.", + ), + "mapping": attr.string_dict( + default = {}, + doc = "Dictionary indicating where individual files in `srcs`" + + " should be mapped to under `dst`. Keys are the origin" + + " path of the file (relative to the build system root) and" + + " values are the destination relative to `dst`. Files" + + " present in `mapping` ignore the `flatten` and" + + " `strip_prefix` attributes: their destination is based" + + " only on `dst` and the value for their key in `mapping`.", + ), + }, +) + +def symlink_inputs(name, rule, symlinked_inputs, **kwargs): + """Wraps a rule and symlinks input files into the current directory tree. 
+ + Args: + name: name for the generated rule. + rule: the rule (or macro) being wrapped. + symlinked_inputs: a dictionary mapping label-list argument names to + dictionaries that map destination directories to the labels whose + files should be symlinked into that directory before being passed + to the generated rule. + **kwargs: additional keyword arguments to forward to the generated rule. + """ + for kwarg, mapping in symlinked_inputs.items(): + for dst, files in mapping.items(): + if kwarg in kwargs: + fail( + "key %s is already present in this rule" % (kwarg,), + attr = "symlinked_inputs", + ) + if dst == None: + kwargs[kwarg] = files + else: + symlinked_target_name = "_{}_{}".format(name, kwarg) + symlink_files( + name = symlinked_target_name, + dst = dst, + srcs = files, + flatten = True, + ) + kwargs[kwarg] = [":" + symlinked_target_name] + rule( + name = name, + **kwargs + ) diff --git a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir index bad74e9b0c9c..90c3e797b250 100644 --- a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir @@ -304,7 +304,7 @@ func.func @broadcast_to_to_reshape(%arg0: tensor<4x4x4xf32>, %arg1 : tensor<4xi3 // Converts tfl.broadcast_to to tfl.reshape if input and output have the same // number of elements. -// CHECK-LABEL: broadcast_to_to_reshape_i64 +// CHECK-LABEL: @broadcast_to_to_reshape_i64 func.func @broadcast_to_to_reshape_i64(%arg0: tensor<4x4x4xf32>, %arg1 : tensor<4xi64>) -> tensor<1x4x4x4xf32> { %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<4x4x4xf32>, tensor<4xi64>) -> tensor<1x4x4x4xf32> // CHECK: "tfl.cast" @@ -317,7 +317,7 @@ func.func @broadcast_to_to_reshape_i64(%arg0: tensor<4x4x4xf32>, %arg1 : tensor< // Converts tfl.broadcast_to to tfl.reshape if input and output have the same // number of elements.
-// CHECK-LABEL: broadcast_to_to_reshape_i64_const +// CHECK-LABEL: @broadcast_to_to_reshape_i64_const func.func @broadcast_to_to_reshape_i64_const(%arg0: tensor<4x4x4xf32>) -> tensor<1x4x4x4xf32> { %cst = arith.constant dense<[1, 4, 4, 4]> : tensor<4xi64> %0 = "tfl.broadcast_to"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<4xi64>) -> tensor<1x4x4x4xf32> @@ -329,6 +329,7 @@ func.func @broadcast_to_to_reshape_i64_const(%arg0: tensor<4x4x4xf32>) -> tensor // ----- +// CHECK-LABEL: @trivial_dynamic_update_slice func.func @trivial_dynamic_update_slice(%arg0: tensor<2x7x14xf32>, %arg1: tensor<2x7x14xf32>) -> tensor<2x7x14xf32> { %0 = arith.constant dense<0> : tensor<3xi32> %1 = "tfl.dynamic_update_slice"(%arg0, %arg1, %0) : (tensor<2x7x14xf32>, tensor<2x7x14xf32>, tensor<3xi32>) -> tensor<2x7x14xf32> @@ -338,6 +339,7 @@ func.func @trivial_dynamic_update_slice(%arg0: tensor<2x7x14xf32>, %arg1: tensor // ----- +// CHECK-LABEL: @trivial_dynamic_update_slice_wrong_update_shape func.func @trivial_dynamic_update_slice_wrong_update_shape(%arg0: tensor<2x7x14xf32>, %arg1: tensor<2x7x7xf32>) -> tensor<2x7x14xf32> { %0 = arith.constant dense<0> : tensor<3xi32> %1 = "tfl.dynamic_update_slice"(%arg0, %arg1, %0) : (tensor<2x7x14xf32>, tensor<2x7x7xf32>, tensor<3xi32>) -> tensor<2x7x14xf32> @@ -381,4 +383,10 @@ func.func @ConstPadToI32(%arg0: tensor<15600xf32>) -> tensor<15602xf32> { // CHECK: "tfl.pad"(%arg0, %cst) : (tensor<15600xf32>, tensor<1x2xi32>) -> tensor<15602xf32> } - +// CHECK-LABEL: @RemoveNoopTranspose +func.func @RemoveNoopTranspose(%arg0: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { + %cst = arith.constant dense<[0, 1, 2, 3]> : tensor<4xi32> + %0 = "tfl.transpose"(%arg0, %cst) : (tensor<1x2x3x4xf32>, tensor<4xi32>) -> tensor<1x2x3x4xf32> + func.return %0 : tensor<1x2x3x4xf32> + // CHECK: return %arg0 +} diff --git a/tensorflow/compiler/mlir/lite/tests/cleanup_optimization_barrier.mlir b/tensorflow/compiler/mlir/lite/tests/cleanup_optimization_barrier.mlir new file mode 100644 index 000000000000..12625023255f --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/cleanup_optimization_barrier.mlir @@ -0,0 +1,14 @@ +// RUN: tf-opt %s --tfl-cleanup-optimization-barrier --split-input-file | FileCheck %s + +// CHECK-LABEL: func.func @cleanup_barrier(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { +// CHECK: %0 = tfl.add(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> +// CHECK: %1 = tfl.add(%0, %cst) <{fused_activation_function = "NONE"}> : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> +// CHECK: return %1 : tensor<2x2xf32> + +func.func @cleanup_barrier(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = arith.constant dense<5.000000e+00> : tensor + %0 = tfl.add(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + %1 = stablehlo.optimization_barrier %0 : tensor<2x2xf32> + %2 = tfl.add(%1, %cst) <{fused_activation_function = "NONE"}> : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + return %2 : tensor<2x2xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/batched_gather_round_trip.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/batched_gather_round_trip.mlir index 12de9da59395..adb22ddd009a 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/batched_gather_round_trip.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/batched_gather_round_trip.mlir @@ -4,11 +4,14 @@ module { // CHECK-LABEL: func.func public @main func.func public @main(%arg0: 
tensor<3x2x4x7x9xi32>, %arg1: tensor<4x3x5x2xi32>) -> tensor<4x3x5x8xi32> { - // CHECK-ROUNDTRIP: %[[iota_1:.*]] = "tfl.pseudo_const"() <{{.*}}> : () -> tensor<4x3x5x1xi32 - // CHECK-ROUNDTRIP: %[[iota_2:.*]] = "tfl.pseudo_const"() <{{.*}}> : () -> tensor<4x3x5x1xi32> - // CHECK-ROUNDTRIP: %[[concat:.*]] = "tfl.concatenation"(%[[iota_1]], %[[iota_2]], %arg1) <{axis = 3 : i32, fused_activation_function = "NONE"}> : + // CHECK-ROUNDTRIP: %0 = "tfl.pseudo_const"() <{value = dense<{{\[\[\[\[}}0]], {{\[\[}}1]], {{\[\[}}2]]]]> : tensor<1x3x1x1xi32>}> : () -> tensor<1x3x1x1xi32> + // CHECK-ROUNDTRIP: %1 = "tfl.pseudo_const"() <{value = dense<[4, 3, 5, 1]> : tensor<4xi64>}> : () -> tensor<4xi64> + // CHECK-ROUNDTRIP: %2 = "tfl.broadcast_to"(%0, %1) : (tensor<1x3x1x1xi32>, tensor<4xi64>) -> tensor<4x3x5x1xi32> + // CHECK-ROUNDTRIP: %3 = "tfl.pseudo_const"() <{value = dense<{{\[\[\[\[}}0]]], {{\[\[\[}}1]]], {{\[\[\[}}2]]], {{\[\[\[}}3]]]]> : tensor<4x1x1x1xi32>}> : () -> tensor<4x1x1x1xi32> + // CHECK-ROUNDTRIP: %4 = "tfl.broadcast_to"(%3, %1) : (tensor<4x1x1x1xi32>, tensor<4xi64>) -> tensor<4x3x5x1xi32> + // CHECK-ROUNDTRIP: %[[concat:.*]] = "tfl.concatenation"(%2, %4, %arg1) <{axis = 3 : i32, fused_activation_function = "NONE"}> : // CHECK-ROUNDTRIP-SAME: (tensor<4x3x5x1xi32>, tensor<4x3x5x1xi32>, tensor<4x3x5x2xi32>) -> tensor<4x3x5x4xi32> - // CHECK-ROUNDTRIP: %[[gather:.*]] = "stablehlo.gather"(%arg0, %2) <{ + // CHECK-ROUNDTRIP: %[[gather:.*]] = "stablehlo.gather"(%arg0, %[[concat]]) <{ // CHECK-ROUNDTRIP-SAME: dimension_numbers = #stablehlo.gather< // CHECK-ROUNDTRIP-SAME: offset_dims = [3], collapsed_slice_dims = [0, 1, 2, 3], // CHECK-ROUNDTRIP-SAME: start_index_map = [0, 2, 1, 3], index_vector_dim = 3>, diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/batched_scatter_round_trip.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/batched_scatter_round_trip.mlir index 44d1bb7dd8b7..7e42ff310c08 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/batched_scatter_round_trip.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/batched_scatter_round_trip.mlir @@ -4,11 +4,14 @@ module { // CHECK-LABEL: func.func public @main func.func public @main(%arg0: tensor<3x2x4x7x9xi32>, %arg1: tensor<4x3x5x2xi32>, %arg2: tensor<4x3x5x8xi32>) -> tensor<3x2x4x7x9xi32> { - // CHECK-ROUNDTRIP: %[[iota_1:.*]] = "tfl.pseudo_const"() <{{.*}}> : () -> tensor<4x3x5x1xi32 - // CHECK-ROUNDTRIP: %[[iota_2:.*]] = "tfl.pseudo_const"() <{{.*}}> : () -> tensor<4x3x5x1xi32> - // CHECK-ROUNDTRIP: %[[concat:.*]] = "tfl.concatenation"(%[[iota_1]], %[[iota_2]], %arg1) <{axis = 3 : i32, fused_activation_function = "NONE"}> : + // CHECK-ROUNDTRIP: %0 = "tfl.pseudo_const"() <{value = dense<{{\[\[\[\[}}0]], {{\[\[}}1]], {{\[\[}}2]]]]> : tensor<1x3x1x1xi32>}> : () -> tensor<1x3x1x1xi32> + // CHECK-ROUNDTRIP: %1 = "tfl.pseudo_const"() <{value = dense<[4, 3, 5, 1]> : tensor<4xi64>}> : () -> tensor<4xi64> + // CHECK-ROUNDTRIP: %2 = "tfl.broadcast_to"(%0, %1) : (tensor<1x3x1x1xi32>, tensor<4xi64>) -> tensor<4x3x5x1xi32> + // CHECK-ROUNDTRIP: %3 = "tfl.pseudo_const"() <{value = dense<{{\[\[\[\[}}0]]], {{\[\[\[}}1]]], {{\[\[\[}}2]]], {{\[\[\[}}3]]]]> : tensor<4x1x1x1xi32>}> : () -> tensor<4x1x1x1xi32> + // CHECK-ROUNDTRIP: %4 = "tfl.broadcast_to"(%3, %1) : (tensor<4x1x1x1xi32>, tensor<4xi64>) -> tensor<4x3x5x1xi32> + // CHECK-ROUNDTRIP: %[[concat:.*]] = "tfl.concatenation"(%2, %4, %arg1) <{axis = 3 : i32, fused_activation_function = "NONE"}> : // CHECK-ROUNDTRIP-SAME: (tensor<4x3x5x1xi32>, 
tensor<4x3x5x1xi32>, tensor<4x3x5x2xi32>) -> tensor<4x3x5x4xi32> - // CHECK-ROUNDTRIP: %[[scatter:.*]] = "stablehlo.scatter"(%arg0, %2, %arg2) <{ + // CHECK-ROUNDTRIP: %[[scatter:.*]] = "stablehlo.scatter"(%arg0, %[[concat]], %arg2) <{ // CHECK-ROUNDTRIP-SAME: scatter_dimension_numbers = #stablehlo.scatter // CHECK-ROUNDTRIP-SAME: update_window_dims = [3], inserted_window_dims = [0, 1, 2, 3], // CHECK-ROUNDTRIP-SAME: scatter_dims_to_operand_dims = [0, 2, 1, 3], index_vector_dim = 3>}> diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir index 2c17e734c58d..e0793cbf803c 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir @@ -5,7 +5,6 @@ func.func @broadcast_to_bf16(%arg0: tensor<3xbf16>, %arg1: tensor<2xi64>) -> ten func.return %0: tensor<3x3xbf16> // CHECK-LABEL: broadcast_to_bf16 -// CHECK: [[CST:%.*]] = arith.constant dense<1.000000e+00> : tensor<3x3xbf16> -// CHECK: [[MUL:%.*]] = tfl.mul(%arg0, [[CST]]) <{fused_activation_function = "NONE"}> : (tensor<3xbf16>, tensor<3x3xbf16>) -> tensor<3x3xbf16> -// CHECK: return [[MUL]] : tensor<3x3xbf16> +// CHECK: %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xbf16>, tensor<2xi64>) -> tensor<3x3xbf16> +// CHECK: return %0 : tensor<3x3xbf16> } diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index c0978d484ee1..c3dc00ca74f1 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -2589,6 +2589,14 @@ func.func @dynamic_update_slice_f16_arg(%arg0: tensor<4x5xf16>, %arg1: tensor<1x // CHECK: "tfl.dynamic_update_slice"(%arg0, %arg1, %arg2) : (tensor<4x5xf16>, tensor<1x5xf16>, tensor<2xi32>) -> tensor<4x5xf16> } +func.func @dynamic_update_slice_i16(%arg0: tensor<4x5xi16>, %arg1: tensor<1x5xi16>, %arg2: tensor<2xi32>) -> tensor<4x5xi16> { + %0 = "tf.XlaDynamicUpdateSlice"(%arg0, %arg1, %arg2) : (tensor<4x5xi16>, tensor<1x5xi16>, tensor<2xi32>) -> tensor<4x5xi16> + func.return %0 : tensor<4x5xi16> + +// CHECK-LABEL:dynamic_update_slice_i16 +// CHECK: "tfl.dynamic_update_slice"(%arg0, %arg1, %arg2) : (tensor<4x5xi16>, tensor<1x5xi16>, tensor<2xi32>) -> tensor<4x5xi16> +} + func.func @testReluI32(%arg0: tensor<1xi32>) -> tensor<1xi32> { %0 = "tf.Relu"(%arg0) : (tensor<1xi32>) -> tensor<1xi32> func.return %0: tensor<1xi32> diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/runtime_version_metadata.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/runtime_version_metadata.mlir new file mode 100644 index 000000000000..123e7f8decae --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/runtime_version_metadata.mlir @@ -0,0 +1,10 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s 2>&1 | FileCheck %s + +module attributes {tfl.metadata = {min_runtime_version = ""}} { + func.func @main(%arg0: tensor<3x2xi32>) -> tensor<3x2xi32> + attributes {tf.entry_function = {inputs = "input", outputs = "SameNameAsOutput"}} { + func.return %arg0 : tensor<3x2xi32> + } +} + +// CHECK: Skipping runtime version metadata in the model. This will be generated by the exporter. 
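Note: the `symlink_files` rule and `symlink_inputs` macro introduced earlier in tensorflow/compiler/mlir/lite/symlink_files.bzl are only defined by this change, not exercised. The following is a minimal usage sketch from a hypothetical BUILD file; the load path matches the new .bzl file, but the packages, target names, file names, and the wrapped `py_library` rule are illustrative assumptions, not part of this patch.

load("//tensorflow/compiler/mlir/lite:symlink_files.bzl", "symlink_files", "symlink_inputs")
load("@rules_python//python:defs.bzl", "py_library")  # assumed source of py_library; illustrative only

# Symlink two files from another (hypothetical) package into a local "testdata"
# subdirectory of this package, flattening away their original directory layout.
symlink_files(
    name = "local_testdata",
    srcs = [
        "//some/other/package:a.mlir",
        "//some/other/package:b.mlir",
    ],
    dst = "testdata",
    flatten = True,
)

# Wrap py_library so that its `srcs` are first symlinked into a "gen" directory
# relative to this package; the macro replaces `srcs` with the symlinked target.
symlink_inputs(
    name = "wrapped_lib",
    rule = py_library,
    symlinked_inputs = {"srcs": {"gen": ["//some/other/package:helper.py"]}},
)

In `symlinked_inputs`, each outer key names a label-list argument of the wrapped rule and each inner key is the destination directory; per the macro definition above, an inner key of `None` forwards the labels unchanged without symlinking.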
diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index 5d8328590fe8..56b82b904259 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -1599,6 +1599,14 @@ func.func @testBatchMatmulHybridQuant(%arg0 : tensor<1x4x384x32xf32>, %arg1 : te // ----- +func.func @testBatchMatmulHybridBf16F32(%arg0 : tensor<1x4x384x32xbf16>, %arg1 : tensor<1x4x384x32xbf16>) -> tensor<1x4x384x384xf32> { + // expected-error @+1 {{'tfl.batch_matmul' op operand #0 must be tensor of 32-bit float or QI8 type or QI16 type or 8-bit signless integer values}} + %0 = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = true} : (tensor<1x4x384x32xbf16>, tensor<1x4x384x32xbf16>) -> tensor<1x4x384x384xf32> + func.return %0 : tensor<1x4x384x384xf32> +} + +// ----- + func.func @testConcat(%arg0: tensor<1x2xi32>, %arg1: tensor<1x2xi32>) -> tensor<2x2xi32> { // CHECK: "tfl.concatenation"(%arg0, %arg1) <{axis = 0 : i32, fused_activation_function = "NONE"}> %0 = "tfl.concatenation"(%arg0, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<2x2xi32> @@ -1751,6 +1759,14 @@ func.func @testStridedSliceWithInvalidOutputType(%arg0: tensor<12x2x2x5xf32>, %a // ----- +func.func @testStridedSliceWithInvalidInputRank(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x1x1x2x2x5xf32> { + // expected-error @+1 {{op failed to verify that input (with new_axis) must have rank at most 5}} + %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 6 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x1x1x2x2x5xf32> + func.return %0 : tensor<1x1x1x2x2x5xf32> +} + +// ----- + // CHECK-LABEL: testOneHot func.func @testOneHot(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor<*xf32> { // CHECK: "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) <{axis = -1 : i32}> : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xf32> @@ -2593,6 +2609,13 @@ func.func @fully_connected(%arg0: tensor<1x37xf32>, %arg1: tensor<40x37xf32>, %a // ----- +func.func @fully_connected_with_int64_num_elements(%arg0: tensor<2048x128xf32>, %arg1: tensor<1049088x128xf32>, %arg2: none) -> tensor<2048x1049088xf32> { + %0 = "tfl.fully_connected"(%arg0, %arg1, %arg2) <{fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"}> : (tensor<2048x128xf32>, tensor<1049088x128xf32>, none) -> tensor<2048x1049088xf32> + func.return %0 : tensor<2048x1049088xf32> +} + +// ----- + func.func @fully_connected_no_bias(%arg0: tensor<2x2x10xf32>, %arg1: tensor<40x40xf32>, %arg2: none) -> tensor<1x40xf32> { %0 = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x2x10xf32>, tensor<40x40xf32>, none) -> tensor<1x40xf32> func.return %0 : tensor<1x40xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index 47fa770ec865..4b9ecc812307 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -510,6 +510,40 @@ func.func @fuseMulIntoFollowingFullyConnected(%arg0: tensor<4x2xf32>) -> tensor< // CHECK-NEXT: return %[[fc]] : tensor<4x2xf32> } +// CHECK-LABEL: 
@DontFuseRhsNonConstMulIntoFollowingFullyConnected +func.func @DontFuseRhsNonConstMulIntoFollowingFullyConnected(%arg0: tensor<4x2xf32>, %arg1: tensor<2xf32>) -> tensor<4x2xf32> { + %mul = "tfl.mul"(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<4x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> + %filter = arith.constant dense<1.750000e+00> : tensor<2x2xf32> + %bias = arith.constant dense<2.000000e+00> : tensor<2xf32> + %fc = "tfl.fully_connected"(%mul, %filter, %bias) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> + func.return %fc : tensor<4x2xf32> + +// CHECK-DAG: %[[MUL:.*]] = tfl.mul(%arg0, %arg1) +// CHECK-DAG: %[[FILTER:.*]] = arith.constant dense<1.750000e+00> : tensor<2x2xf32> +// CHECK-DAG: %[[BIAS:.*]] = arith.constant dense<2.000000e+00> : tensor<2xf32> +// CHECK-NEXT: %[[FC:.*]] = "tfl.fully_connected"(%[[MUL]], %[[FILTER]], %[[BIAS]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> +// CHECK-NEXT: return %[[FC]] : tensor<4x2xf32> +} + +// CHECK-LABEL: @DontFuseMulIntoFollowingWeightOnlyQuantizedFullyConnected +func.func @DontFuseMulIntoFollowingWeightOnlyQuantizedFullyConnected(%arg0: tensor<4x2xf32>) -> tensor<4x2xf32> { + %mul_cst = arith.constant dense<[1.500000e+00, 1.600000e+00]> : tensor<2xf32> + %mul = "tfl.mul"(%arg0, %mul_cst) <{fused_activation_function = "NONE"}> : (tensor<4x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> + %filter_quant = "tfl.pseudo_qconst"() <{qtype = tensor<2x2x!quant.uniform>, value = dense<9> : tensor<2x2xi8>}> : () -> tensor<2x2x!quant.uniform> + %filter_dq = "tfl.dequantize"(%filter_quant) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> + %bias = arith.constant dense<2.000000e+00> : tensor<2xf32> + %weight_only_fc = "tfl.fully_connected"(%mul, %filter_dq, %bias) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> + func.return %weight_only_fc : tensor<4x2xf32> + +// CHECK-DAG: %[[MUL_CST:.*]] = arith.constant dense<[1.500000e+00, 1.600000e+00]> : tensor<2xf32> +// CHECK-DAG: %[[MUL:.*]] = tfl.mul(%arg0, %[[MUL_CST]]) +// CHECK-DAG: %[[FILTER_QUANT:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x2x!quant.uniform>, value = dense<9> : tensor<2x2xi8>}> : () -> tensor<2x2x!quant.uniform> +// CHECK-DAG: %[[FILTER_DQ:.*]] = "tfl.dequantize"(%[[FILTER_QUANT]]) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> +// CHECK-DAG: %[[BIAS:.*]] = arith.constant dense<2.000000e+00> : tensor<2xf32> +// CHECK-NEXT: %[[WEIGHT_ONLY_FC:.*]] = "tfl.fully_connected"(%[[MUL]], %[[FILTER_DQ]], %[[BIAS]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> +// CHECK-NEXT: return %[[WEIGHT_ONLY_FC]] : tensor<4x2xf32> +} + // CHECK-LABEL: @fuseMulIntoFullyConnectedBroadcast func.func @fuseMulIntoFullyConnectedBroadcast(%arg0: tensor<1x3xf32>) -> tensor<1x2xf32> { %cst0 = arith.constant dense<[[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]]> : tensor<2x3xf32> @@ -2539,15 +2573,21 @@ func.func @DontConvertMul1WithBroadcastToIdentity(%arg0: tensor<2xf32>) -> tenso } // CHECK-LABEL: ConvertConstSelectToIdentity -func.func @ConvertConstSelectToIdentity(%arg0: tensor<1x2x3x4xf32>, %arg1: tensor<1x2x3x4xf32>) -> (tensor<1x2x3x4xf32>, 
tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) { +func.func @ConvertConstSelectToIdentity(%arg0: tensor<1x2x3x4xf32>, %arg1: tensor<1x2x3x4xf32>, %arg2: tensor<1x2x3x4xi1>) -> (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1>) { %cst_true = arith.constant dense : tensor<1x2x3x4xi1> %cst_false = arith.constant dense : tensor<1x2x3x4xi1> %0 = "tfl.select"(%cst_true, %arg0, %arg1) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> %1 = "tfl.select_v2"(%cst_true, %arg0, %arg1) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> %2 = "tfl.select"(%cst_false, %arg0, %arg1) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> %3 = "tfl.select_v2"(%cst_false, %arg0, %arg1) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> - func.return %0, %1, %2, %3 : tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32> - // CHECK: return %arg0, %arg0, %arg1, %arg1 + %4 = "tfl.select"(%arg2, %cst_true, %cst_false) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1>) -> tensor<1x2x3x4xi1> + %5 = "tfl.select_v2"(%arg2, %cst_true, %cst_false) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1>) -> tensor<1x2x3x4xi1> + %6 = "tfl.select"(%arg2, %cst_false, %cst_true) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1>) -> tensor<1x2x3x4xi1> + %7 = "tfl.select_v2"(%arg2, %cst_false, %cst_true) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1>) -> tensor<1x2x3x4xi1> + func.return %0, %1, %2, %3, %4, %5, %6, %7 : tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1>, tensor<1x2x3x4xi1> + // CHECK: %0 = "tfl.logical_not"(%arg2) : (tensor<1x2x3x4xi1>) -> tensor<1x2x3x4xi1> + // CHECK: %1 = "tfl.logical_not"(%arg2) : (tensor<1x2x3x4xi1>) -> tensor<1x2x3x4xi1> + // CHECK: return %arg0, %arg0, %arg1, %arg1, %arg2, %arg2, %0, %1 } // CHECK-LABEL: DontConvertConstSelectBroadcast @@ -3712,6 +3752,46 @@ func.func @gelu_approximate(%arg0: tensor<3xf32>) -> tensor<3xf32> { // CHECK: "tfl.gelu"(%arg0) <{approximate = true}> : (tensor<3xf32>) -> tensor<3xf32> } +func.func @gelu_approximate_with_mul(%arg0: tensor<3xf32>) -> tensor<3xf32> { + %cst = arith.constant dense<0.797884583> : tensor + %cst_0 = arith.constant dense<5.000000e-01> : tensor + %cst_1 = arith.constant dense<1.000000e+00> : tensor + %cst_3 = arith.constant dense<4.471500e-02> : tensor + %99 = "tfl.mul"(%arg0, %arg0) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + %0 = "tfl.mul"(%99, %arg0) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + %1 = "tfl.mul"(%0, %cst_3) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %2 = "tfl.add"(%arg0, %1) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + %3 = "tfl.mul"(%2, %cst) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %4 = "tfl.tanh"(%3) : (tensor<3xf32>) -> tensor<3xf32> + %5 = "tfl.add"(%4, %cst_1) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %6 = "tfl.mul"(%arg0, %cst_0) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %7 = "tfl.mul"(%6, %5) 
{fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + func.return %7 : tensor<3xf32> + +// CHECK-LABEL:gelu_approximate +// CHECK: "tfl.gelu"(%arg0) <{approximate = true}> : (tensor<3xf32>) -> tensor<3xf32> +} + +func.func @gelu_approximate_with_mul2(%arg0: tensor<3xf32>) -> tensor<3xf32> { + %cst = arith.constant dense<0.797884583> : tensor + %cst_0 = arith.constant dense<5.000000e-01> : tensor + %cst_1 = arith.constant dense<1.000000e+00> : tensor + %cst_3 = arith.constant dense<4.471500e-02> : tensor + %99 = "tfl.mul"(%arg0, %arg0) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + %0 = "tfl.mul"(%arg0, %99) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + %1 = "tfl.mul"(%0, %cst_3) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %2 = "tfl.add"(%arg0, %1) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + %3 = "tfl.mul"(%2, %cst) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %4 = "tfl.tanh"(%3) : (tensor<3xf32>) -> tensor<3xf32> + %5 = "tfl.add"(%4, %cst_1) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %6 = "tfl.mul"(%arg0, %cst_0) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %7 = "tfl.mul"(%6, %5) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + func.return %7 : tensor<3xf32> + +// CHECK-LABEL:gelu_approximate +// CHECK: "tfl.gelu"(%arg0) <{approximate = true}> : (tensor<3xf32>) -> tensor<3xf32> +} + func.func @gelu_approximate1(%arg0: tensor<3xf32>) -> tensor<3xf32> { %cst = arith.constant dense<0.797884583> : tensor %cst_0 = arith.constant dense<5.000000e-01> : tensor @@ -3732,6 +3812,49 @@ func.func @gelu_approximate1(%arg0: tensor<3xf32>) -> tensor<3xf32> { // CHECK: "tfl.gelu"(%arg0) <{approximate = true}> : (tensor<3xf32>) -> tensor<3xf32> } +func.func @gelu_approximate1_with_mul(%arg0: tensor<3xf32>) -> tensor<3xf32> { + %cst = arith.constant dense<0.797884583> : tensor + %cst_0 = arith.constant dense<5.000000e-01> : tensor + %cst_1 = arith.constant dense<1.000000e+00> : tensor + %cst_2 = arith.constant dense<3.000000e+00> : tensor + %cst_3 = arith.constant dense<4.471500e-02> : tensor + %99 = "tfl.mul"(%arg0, %arg0) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + %0 = "tfl.mul"(%99, %arg0) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + %1 = "tfl.mul"(%0, %cst_3) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %2 = "tfl.add"(%arg0, %1) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + %3 = "tfl.mul"(%2, %cst) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %4 = "tfl.tanh"(%3) : (tensor<3xf32>) -> tensor<3xf32> + %5 = "tfl.add"(%4, %cst_1) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %6 = "tfl.mul"(%5, %cst_0) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %7 = "tfl.mul"(%arg0, %6) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + func.return %7 : tensor<3xf32> + +// CHECK-LABEL:gelu_approximate +// CHECK: "tfl.gelu"(%arg0) <{approximate = true}> : (tensor<3xf32>) -> tensor<3xf32> +} + + +func.func @gelu_approximate1_with_mul1(%arg0: tensor<3xf32>) -> 
tensor<3xf32> { + %cst = arith.constant dense<0.797884583> : tensor + %cst_0 = arith.constant dense<5.000000e-01> : tensor + %cst_1 = arith.constant dense<1.000000e+00> : tensor + %cst_2 = arith.constant dense<3.000000e+00> : tensor + %cst_3 = arith.constant dense<4.471500e-02> : tensor + %99 = "tfl.mul"(%arg0, %arg0) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + %0 = "tfl.mul"(%arg0, %99) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + %1 = "tfl.mul"(%0, %cst_3) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %2 = "tfl.add"(%arg0, %1) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + %3 = "tfl.mul"(%2, %cst) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %4 = "tfl.tanh"(%3) : (tensor<3xf32>) -> tensor<3xf32> + %5 = "tfl.add"(%4, %cst_1) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %6 = "tfl.mul"(%5, %cst_0) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor) -> tensor<3xf32> + %7 = "tfl.mul"(%arg0, %6) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xf32> + func.return %7 : tensor<3xf32> + +// CHECK-LABEL:gelu_approximate +// CHECK: "tfl.gelu"(%arg0) <{approximate = true}> : (tensor<3xf32>) -> tensor<3xf32> +} + func.func @gelu_approximate_no_match(%arg0: tensor<3xf32>) -> tensor<3xf32> { %cst = arith.constant dense<0.797884583> : tensor %cst_0 = arith.constant dense<5.000000e-01> : tensor @@ -4310,11 +4433,11 @@ func.func @FuseExcessBroadcastingOnReshapes(%arg0: tensor<1x8xf32>) -> tensor<1x %1 = "tfl.broadcast_to"(%0, %cst_0) : (tensor<1x1x1x8x1x1xf32>, tensor<6xi32>) -> tensor<1x1x1x8x16x1xf32> %2 = "tfl.reshape"(%1, %cst_1) : (tensor<1x1x1x8x16x1xf32>, tensor<4xi32>) -> tensor<1x1x1x128xf32> return %2 : tensor<1x1x1x128xf32> - // CHECK: %cst = arith.constant dense<1.000000e+00> : tensor<8x16xf32> + // CHECK: %cst = arith.constant dense<[8, 16]> : tensor<2xi64> // CHECK: %cst_0 = arith.constant dense<[1, 1, 1, 128]> : tensor<4xi32> // CHECK: %cst_1 = arith.constant dense<[8, 1]> : tensor<2xi32> // CHECK: %0 = "tfl.reshape"(%arg0, %cst_1) : (tensor<1x8xf32>, tensor<2xi32>) -> tensor<8x1xf32> - // CHECK: %1 = tfl.mul(%0, %cst) <{fused_activation_function = "NONE"}> : (tensor<8x1xf32>, tensor<8x16xf32>) -> tensor<8x16xf32> + // CHECK: %1 = "tfl.broadcast_to"(%0, %cst) : (tensor<8x1xf32>, tensor<2xi64>) -> tensor<8x16xf32> // CHECK: %2 = "tfl.reshape"(%1, %cst_0) : (tensor<8x16xf32>, tensor<4xi32>) -> tensor<1x1x1x128xf32> // CHECK: return %2 : tensor<1x1x1x128xf32> } @@ -4336,83 +4459,63 @@ func.func @FuseExcessBroadcastingOnReshapesDynamicShapes(%arg0: tensor, %arg1: tensor<2xi32>) -> tensor<3x3xf32> { %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xf32>, tensor<2xi32>) -> tensor<3x3xf32> return %0 : tensor<3x3xf32> - // CHECK: %cst = arith.constant dense<1.000000e+00> : tensor<3x3xf32> - // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> - // CHECK: return %0 : tensor<3x3xf32> + // CHECK: tfl.broadcast_to } // CHECK-LABEL: @broadcast_to_i32_low_dim func.func @broadcast_to_i32_low_dim(%arg0: tensor<3xi32>, %arg1: tensor<2xi32>) -> tensor<3x3xi32> { %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xi32>, tensor<2xi32>) -> tensor<3x3xi32> return %0 : tensor<3x3xi32> - // CHECK: %cst = arith.constant dense<1> : tensor<3x3xi32> - // CHECK: %0 = 
tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> - // CHECK: return %0 : tensor<3x3xi32> + // CHECK: tfl.broadcast_to } // CHECK-LABEL: @broadcast_to_low_dim_with_unknown_shape func.func @broadcast_to_low_dim_with_unknown_shape(%arg0: tensor<3xf32>, %arg1: tensor<*xi32>) -> tensor<3x3xf32> { %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xf32>, tensor<*xi32>) -> tensor<3x3xf32> return %0 : tensor<3x3xf32> - // CHECK: %cst = arith.constant dense<1.000000e+00> : tensor<3x3xf32> - // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> - // CHECK: return %0 : tensor<3x3xf32> + // CHECK: tfl.broadcast_to } // CHECK-LABEL: @broadcast_to_i16_low_dim func.func @broadcast_to_i16_low_dim(%arg0: tensor<3xi16>, %arg1: tensor<2xi32>) -> tensor<3x3xi16> { %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xi16>, tensor<2xi32>) -> tensor<3x3xi16> return %0 : tensor<3x3xi16> - // CHECK: %cst = arith.constant dense<1> : tensor<3x3xi16> - // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<3xi16>, tensor<3x3xi16>) -> tensor<3x3xi16> - // CHECK: return %0 : tensor<3x3xi16> + // CHECK: tfl.broadcast_to } // CHECK-LABEL: @broadcast_to_i32_low_dim_with_unknown_output func.func @broadcast_to_i32_low_dim_with_unknown_output(%arg0: tensor<3xi32>, %arg1: tensor<2xi32>) -> tensor<*xi32> { %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xi32>, tensor<2xi32>) -> tensor<*xi32> return %0 : tensor<*xi32> - // CHECK: %cst = arith.constant dense<1> : tensor - // CHECK: %0 = "tfl.fill"(%arg1, %cst) : (tensor<2xi32>, tensor) -> tensor<*xi32> - // CHECK: %1 = tfl.mul(%arg0, %0) <{fused_activation_function = "NONE"}> : (tensor<3xi32>, tensor<*xi32>) -> tensor<*xi32> - // CHECK: return %1 : tensor<*xi32> + // CHECK: tfl.broadcast_to } // CHECK-LABEL: @broadcast_to_ui32 func.func @broadcast_to_ui32(%arg0: tensor, %arg1: tensor<1xi64>) -> tensor<10xui32> { %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor, tensor<1xi64>) -> tensor<10xui32> return %0 : tensor<10xui32> - // CHECK: %cst = arith.constant dense<1> : tensor<10xui32> - // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor, tensor<10xui32>) -> tensor<10xui32> - // CHECK: return %0 : tensor<10xui32> + // CHECK: tfl.broadcast_to } // CHECK-LABEL: @broadcast_to_f32 func.func @broadcast_to_f32(%arg0: tensor<3xf32>, %arg1: tensor<2xi32>) -> tensor<3x3xf32> { %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xf32>, tensor<2xi32>) -> tensor<3x3xf32> return %0 : tensor<3x3xf32> - // CHECK: %cst = arith.constant dense<1.000000e+00> : tensor<3x3xf32> - // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> - // CHECK: return %0 : tensor<3x3xf32> + // CHECK: tfl.broadcast_to } // CHECK-LABEL: @broadcast_to_i32 func.func @broadcast_to_i32(%arg0: tensor<3xi32>, %arg1: tensor<2xi32>) -> tensor<3x3xi32> { %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xi32>, tensor<2xi32>) -> tensor<3x3xi32> return %0 : tensor<3x3xi32> - // CHECK: %cst = arith.constant dense<1> : tensor<3x3xi32> - // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> - // CHECK: return %0 : tensor<3x3xi32> + // CHECK: tfl.broadcast_to } // CHECK-LABEL: @broadcast_to_i32_with_dynamic_shape_and_output func.func @broadcast_to_i32_with_dynamic_shape_and_output(%arg0: 
tensor<3xi32>, %arg1: tensor<2xi32>) -> tensor<3x?xi32> { %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xi32>, tensor<2xi32>) -> tensor<3x?xi32> return %0 : tensor<3x?xi32> - // CHECK: %cst = arith.constant dense<1> : tensor - // CHECK: %0 = "tfl.fill"(%arg1, %cst) : (tensor<2xi32>, tensor) -> tensor<3x?xi32> - // CHECK: %1 = tfl.mul(%arg0, %0) <{fused_activation_function = "NONE"}> : (tensor<3xi32>, tensor<3x?xi32>) -> tensor<3x?xi32> - // CHECK: return %1 : tensor<3x?xi32> + // CHECK: tfl.broadcast_to } // CHECK-LABEL: @broadcast_to_ui32_with_dynamic_output @@ -4530,4 +4633,198 @@ func.func @RealDivWithConstDivisor(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // CHECK: %cst = arith.constant dense<2.000000e-01> : tensor // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<2x3xf32>, tensor) -> tensor<2x3xf32> // CHECK: return %0 : tensor<2x3xf32> -} \ No newline at end of file +} + +//CHECK-LABEL: @PushTransposeThroughSqueezeNoDims +func.func @PushTransposeThroughSqueezeNoDims(%arg0: tensor<1x1x2x3xf32>) -> (tensor<3x2xf32>) { + %cst = arith.constant dense<[0, 3, 1, 2]> : tensor<4xi32> + %0 = "tfl.transpose"(%arg0, %cst) : (tensor<1x1x2x3xf32>, tensor<4xi32>) -> tensor<1x3x1x2xf32> + %1 = "tfl.squeeze"(%0): (tensor<1x3x1x2xf32>) -> tensor<3x2xf32> + return %1 : tensor<3x2xf32> + + // CHECK: %cst = arith.constant dense<[1, 0]> : tensor<2xi32> + // CHECK: %cst_0 = arith.constant dense<[2, 3]> : tensor<2xi32> + // CHECK: %0 = "tfl.reshape"(%arg0, %cst_0) : (tensor<1x1x2x3xf32>, tensor<2xi32>) -> tensor<2x3xf32> + // CHECK: %1 = "tfl.transpose"(%0, %cst) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32> +} + +//CHECK-LABEL: @PushTransposeThroughSqueeze1 +func.func @PushTransposeThroughSqueeze1(%arg0: tensor<1x1x2x3xf32>) -> (tensor<3x2xf32>) { + %cst = arith.constant dense<[0, 3, 1, 2]> : tensor<4xi32> + %0 = "tfl.transpose"(%arg0, %cst) : (tensor<1x1x2x3xf32>, tensor<4xi32>) -> tensor<1x3x1x2xf32> + %1 = "tfl.squeeze"(%0) {squeeze_dims = [0, 2]}: (tensor<1x3x1x2xf32>) -> tensor<3x2xf32> + return %1 : tensor<3x2xf32> + + // CHECK: %cst = arith.constant dense<[1, 0]> : tensor<2xi32> + // CHECK: %cst_0 = arith.constant dense<[2, 3]> : tensor<2xi32> + // CHECK: %0 = "tfl.reshape"(%arg0, %cst_0) : (tensor<1x1x2x3xf32>, tensor<2xi32>) -> tensor<2x3xf32> + // CHECK: %1 = "tfl.transpose"(%0, %cst) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32> + // CHECK: return +} + +//CHECK-LABEL: @PushTransposeThroughSqueeze2 +func.func @PushTransposeThroughSqueeze2(%arg0: tensor<1x1x2x3xf32>) -> (tensor<2x3xf32>) { + %cst = arith.constant dense<[1, 2, 0, 3]> : tensor<4xi32> + %0 = "tfl.transpose"(%arg0, %cst) : (tensor<1x1x2x3xf32>, tensor<4xi32>) -> tensor<1x2x1x3xf32> + %1 = "tfl.squeeze"(%0) {squeeze_dims = [0, 2]}: (tensor<1x2x1x3xf32>) -> tensor<2x3xf32> + return %1 : tensor<2x3xf32> + + // CHECK: %cst = arith.constant dense<[2, 3]> : tensor<2xi32> + // CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<1x1x2x3xf32>, tensor<2xi32>) -> tensor<2x3xf32> + // CHECK: return +} + +//CHECK-LABEL: @EliminateBooleanCastCompare +func.func @EliminateBooleanCastCompare(%arg0: tensor<*xi1>) -> (tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>) { + %zero = arith.constant dense<0> : tensor + %cast = "tfl.cast"(%arg0) : (tensor<*xi1>) -> tensor<*xi32> + + %1 = "tfl.equal"(%cast, %zero) : (tensor<*xi32>, tensor) -> tensor<*xi1> + %2 = "tfl.less_equal"(%cast, %zero) 
: (tensor<*xi32>, tensor) -> tensor<*xi1> + %3 = "tfl.greater_equal"(%cast, %zero) : (tensor<*xi32>, tensor) -> tensor<*xi1> + %4 = "tfl.not_equal"(%cast, %zero) : (tensor<*xi32>, tensor) -> tensor<*xi1> + %5 = "tfl.greater"(%cast, %zero) : (tensor<*xi32>, tensor) -> tensor<*xi1> + %6 = "tfl.less"(%cast, %zero) : (tensor<*xi32>, tensor) -> tensor<*xi1> + + %7 = "tfl.equal"(%zero, %cast) : (tensor, tensor<*xi32>) -> tensor<*xi1> + %8 = "tfl.less_equal"(%zero, %cast) : (tensor, tensor<*xi32>) -> tensor<*xi1> + %9 = "tfl.greater_equal"(%zero, %cast) : (tensor, tensor<*xi32>) -> tensor<*xi1> + %10 = "tfl.not_equal"(%zero, %cast) : (tensor, tensor<*xi32>) -> tensor<*xi1> + %11 = "tfl.greater"(%zero, %cast) : (tensor, tensor<*xi32>) -> tensor<*xi1> + %12 = "tfl.less"(%zero, %cast) : (tensor, tensor<*xi32>) -> tensor<*xi1> + + return %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12 : tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1> + + // CHECK: %0 = "tfl.logical_not"(%arg0) : (tensor<*xi1>) -> tensor<*xi1> + // CHECK: %1 = "tfl.logical_not"(%arg0) : (tensor<*xi1>) -> tensor<*xi1> + // CHECK: %2 = "tfl.zeros_like"(%arg0) : (tensor<*xi1>) -> tensor<*xi1> + // CHECK: %3 = "tfl.logical_not"(%2) : (tensor<*xi1>) -> tensor<*xi1> + // CHECK: %4 = "tfl.zeros_like"(%arg0) : (tensor<*xi1>) -> tensor<*xi1> + // CHECK: %5 = "tfl.logical_not"(%arg0) : (tensor<*xi1>) -> tensor<*xi1> + // CHECK: %6 = "tfl.zeros_like"(%arg0) : (tensor<*xi1>) -> tensor<*xi1> + // CHECK: %7 = "tfl.logical_not"(%6) : (tensor<*xi1>) -> tensor<*xi1> + // CHECK: %8 = "tfl.logical_not"(%arg0) : (tensor<*xi1>) -> tensor<*xi1> + // CHECK: %9 = "tfl.zeros_like"(%arg0) : (tensor<*xi1>) -> tensor<*xi1> + // CHECK: return %0, %1, %3, %arg0, %arg0, %4, %5, %7, %8, %arg0, %9, %arg0 : tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1>, tensor<*xi1> +} + +// CHECK-LABEL: @ReorderTransposeReshapeTranspose +func.func @ReorderTransposeReshapeTranspose(%arg0: tensor<282x2048xf32>) -> tensor<2x1x282x1024xf32> { + %cst = arith.constant dense<[1, 0]> : tensor<2xi32> + %cst_1 = arith.constant dense<[2, 1024, 1, 282]> : tensor<4xi32> + %cst_2 = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> + %0 = "tfl.transpose"(%arg0, %cst) : (tensor<282x2048xf32>, tensor<2xi32>) -> tensor<2048x282xf32> + %1 = "tfl.reshape"(%0, %cst_1) : (tensor<2048x282xf32>, tensor<4xi32>) -> tensor<2x1024x1x282xf32> + %2 = "tfl.transpose"(%1, %cst_2) : (tensor<2x1024x1x282xf32>, tensor<4xi32>) -> tensor<2x1x282x1024xf32> + return %2: tensor<2x1x282x1024xf32> + + // CHECK: %cst = arith.constant dense<[1, 3, 0, 2]> : tensor<4xi32> + // CHECK-NEXT: %cst_0 = arith.constant dense<[282, 2, 1024, 1]> : tensor<4xi32> + // CHECK-NEXT: %0 = "tfl.reshape"(%arg0, %cst_0) : (tensor<282x2048xf32>, tensor<4xi32>) -> tensor<282x2x1024x1xf32> + // CHECK-NEXT: %1 = "tfl.transpose"(%0, %cst) : (tensor<282x2x1024x1xf32>, tensor<4xi32>) -> tensor<2x1x282x1024xf32> + // CHECK-NEXT: return %1 : tensor<2x1x282x1024xf32> +} + +// CHECK-LABEL: @FullyConnectedSwapOperandsWhenLHSIsConst +func.func @FullyConnectedSwapOperandsWhenLHSIsConst(%arg0: tensor<4x2xf32>, %arg1: none) -> tensor<2x4xf32> { + %cst = arith.constant dense<[[1.0, 2.0], [3.0, 4.0]]> : tensor<2x2xf32> + %0 = "tfl.fully_connected"(%cst, %arg0, %arg1) {asymmetric_quantize_inputs = true, fused_activation_function = "NONE", 
keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x2xf32>, tensor<4x2xf32>, none) -> tensor<2x4xf32> + func.return %0 : tensor<2x4xf32> + + // CHECK: %cst = arith.constant dense<[1, 0]> : tensor<2xi32> + // CHECK-NEXT: %cst_0 = arith.constant dense<{{\[}}[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00]]> : tensor<2x2xf32> + // CHECK-NEXT: %0 = "tfl.fully_connected"(%arg0, %cst_0, %arg1) <{asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<4x2xf32>, tensor<2x2xf32>, none) -> tensor<4x2xf32> + // CHECK-NEXT: %1 = "tfl.transpose"(%0, %cst) : (tensor<4x2xf32>, tensor<2xi32>) -> tensor<2x4xf32> + // CHECK-NEXT: return %1 : tensor<2x4xf32> +} + +// CHECK-LABEL: @FullyConnectedSwapOperandsWhenLHSIsConstBias +func.func @FullyConnectedSwapOperandsWhenLHSIsConstBias(%arg0: tensor<4x2xf32>) -> tensor<2x4xf32> { + %cst = arith.constant dense<[[1.0, 2.0], [3.0, 4.0]]> : tensor<2x2xf32> + %cst_1 = arith.constant dense<2.0> : tensor<2xf32> + %0 = "tfl.fully_connected"(%cst, %arg0, %cst_1) {asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x2xf32>, tensor<4x2xf32>, tensor<2xf32>) -> tensor<2x4xf32> + func.return %0 : tensor<2x4xf32> + + // CHECK: [[cst:%.*]] = arith.constant + // CHECK-NEXT: [[cst_1:%.*]] = arith.constant + // CHECK-NOT: %0 = "tfl.fully_connected"(%arg0, [[cst]], [[cst_1]]) +} + +// CHECK-LABEL: @FullyConnectedSwapOperandsWhenLHSIsConstKeepNumDimsTrue +func.func @FullyConnectedSwapOperandsWhenLHSIsConstKeepNumDimsTrue(%arg0: tensor<4x2xf32>, %arg1: none) -> tensor<2x4xf32> { + %cst = arith.constant dense<[[1.0, 2.0], [3.0, 4.0]]> : tensor<2x2xf32> + %0 = "tfl.fully_connected"(%cst, %arg0, %arg1) {asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<2x2xf32>, tensor<4x2xf32>, none) -> tensor<2x4xf32> + func.return %0 : tensor<2x4xf32> + + // CHECK: [[cst:%.*]] = arith.constant + // CHECK-NOT: %0 = "tfl.fully_connected"(%arg0, [[cst]], %arg1) +} + +// CHECK-LABEL: @FullyConnectedSwapOperandsWhenLHSIsConstFusedActivationFunction +func.func @FullyConnectedSwapOperandsWhenLHSIsConstFusedActivationFunction(%arg0: tensor<4x2xf32>, %arg1: none) -> tensor<2x4xf32> { + %cst = arith.constant dense<[[1.0, 2.0], [3.0, 4.0]]> : tensor<2x2xf32> + %0 = "tfl.fully_connected"(%cst, %arg0, %arg1) {asymmetric_quantize_inputs = true, fused_activation_function = "RELU", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x2xf32>, tensor<4x2xf32>, none) -> tensor<2x4xf32> + func.return %0 : tensor<2x4xf32> + + // CHECK: [[cst:%.*]] = arith.constant + // CHECK-NOT: %0 = "tfl.fully_connected"(%arg0, [[cst]], %arg1) +} + +// CHECK-LABEL: @FullyConnectedSwapOperandsWhenLHSIsConstLHSRank3 +func.func @FullyConnectedSwapOperandsWhenLHSIsConstLHSRank3(%arg0: tensor<512x512xf32>, %arg1: none) -> tensor<1x1x512xf32> { + %cst = arith.constant dense<1.0> : tensor<1x1x512xf32> + %0 = "tfl.fully_connected"(%cst, %arg0, %arg1) {asymmetric_quantize_inputs = true, fused_activation_function = "RELU", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x1x512xf32>, tensor<512x512xf32>, none) -> tensor<1x1x512xf32> + func.return %0 : tensor<1x1x512xf32> + + // CHECK: %0 = "tfl.fully_connected"(%cst, %arg0, %arg1) +} + +// CHECK-LABEL: @AddComputedZero +func.func @AddComputedZero(%arg0: tensor<512x512xf32>, %arg1: tensor<1x512xf32>) -> 
tensor<512x512xf32> { + %0 = "tfl.sub"(%arg1, %arg1) {fused_activation_function = "NONE"} : (tensor<1x512xf32>, tensor<1x512xf32>) -> tensor<1x512xf32> + // Add broadcasts, but the output shape is the same as input + %1 = "tfl.add"(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<512x512xf32>, tensor<1x512xf32>) -> tensor<512x512xf32> + func.return %1 : tensor<512x512xf32> + + // CHECK-NOT: tfl.sub + // CHECK-NOT: tfl.add +} + +// CHECK-LABEL: @AddComputedZeroNegative +func.func @AddComputedZeroNegative(%arg0: tensor<1x512xf32>, %arg1: tensor<512x512xf32>) -> tensor<512x512xf32> { + %0 = "tfl.sub"(%arg1, %arg1) {fused_activation_function = "NONE"} : (tensor<512x512xf32>, tensor<512x512xf32>) -> tensor<512x512xf32> + // Add broadcasts, the output shape is larger than the input + %1 = "tfl.add"(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<1x512xf32>, tensor<512x512xf32>) -> tensor<512x512xf32> + func.return %1 : tensor<512x512xf32> + + // CHECK: %0 = tfl.sub %arg1, %arg1 {fused_activation_function = "NONE"} : tensor<512x512xf32> + // CHECK: %1 = tfl.add(%arg0, %0) <{fused_activation_function = "NONE"}> : (tensor<1x512xf32>, tensor<512x512xf32>) -> tensor<512x512xf32> +} + +// CHECK-LABEL: @DegerateFC +func.func @DegerateFC(%input: tensor<5x3x1xf32>) -> tensor<5x3x2xf32> { + %weights = arith.constant dense<[[1.0], [2.0]]> : tensor<2x1xf32> + %bias = "tfl.no_value"() {value} : () -> none + %0 = "tfl.fully_connected"(%input, %weights, %bias) {asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<5x3x1xf32>, tensor<2x1xf32>, none) -> tensor<5x3x2xf32> + func.return %0: tensor<5x3x2xf32> + + // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<5x3x1xf32>, tensor<2xf32>) -> tensor<5x3x2xf32> +} + +// CHECK-LABEL: @DegerateFCNegative +func.func @DegerateFCNegative(%input_ok: tensor<5x3x1xf32>, %input_too_many_dims: tensor<11x7x5x3x1xf32>, %input_last_dim_not_1: tensor<5x3x2xf32>) -> (tensor<11x7x5x3x2xf32>, tensor<5x3x2xf32>, tensor<5x3x2xf32>, tensor<5x3x2xf32>) { + %weights_ok = arith.constant dense<[[1.0], [2.0]]> : tensor<2x1xf32> + %weights_last_dim_not_1 = arith.constant dense<[[1.0, 2.0], [3.0, 4.0]]> : tensor<2x2xf32> + %weights_quantized = "tfl.pseudo_qconst"() <{qtype = tensor<2x1x!quant.uniform>, value = dense<42> : tensor<2x1xi8>}> : () -> tensor<2x1x!quant.uniform> + + %bias_ok = "tfl.no_value"() {value} : () -> none + %bias_notnull = arith.constant dense<[1.0, 2.0]>: tensor<2xf32> + + %1 = "tfl.fully_connected"(%input_too_many_dims, %weights_ok, %bias_ok) {asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<11x7x5x3x1xf32>, tensor<2x1xf32>, none) -> tensor<11x7x5x3x2xf32> + %2 = "tfl.fully_connected"(%input_last_dim_not_1, %weights_last_dim_not_1, %bias_ok) {asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<5x3x2xf32>, tensor<2x2xf32>, none) -> tensor<5x3x2xf32> + %3 = "tfl.fully_connected"(%input_ok, %weights_quantized, %bias_ok) {asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<5x3x1xf32>, tensor<2x1x!quant.uniform>, none) -> tensor<5x3x2xf32> + %4 = "tfl.fully_connected"(%input_ok, %weights_ok, %bias_notnull) {asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = true, weights_format 
= "DEFAULT"} : (tensor<5x3x1xf32>, tensor<2x1xf32>, tensor<2xf32>) -> tensor<5x3x2xf32> + func.return %1, %2, %3, %4 : tensor<11x7x5x3x2xf32>, tensor<5x3x2xf32>, tensor<5x3x2xf32>, tensor<5x3x2xf32> + + // CHECK-NOT: tfl.mul +} diff --git a/tensorflow/compiler/mlir/lite/tests/optimize_batch_matmul.mlir b/tensorflow/compiler/mlir/lite/tests/optimize_batch_matmul.mlir index 79f50aaaadab..39b1346bcf93 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize_batch_matmul.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize_batch_matmul.mlir @@ -170,3 +170,36 @@ func.func @BatchmatmulToReduceSumF32(%arg0: tensor<1x16384x257xf32>) -> (tensor< // CHECK: %[[CONST_DIM:.*]] = "tfl.pseudo_const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> // CHECK: %[[RED:.*]] = "tfl.sum"(%arg0, %[[CONST_DIM]]) <{keep_dims = true}> : (tensor<1x16384x257xf32>, tensor<1xi32>) -> tensor<1x1x257xf32> } + +// CHECK-LABEL: FuseBatchMatmulToTransposeNoBatchDims +func.func @FuseBatchMatmulToTransposeNoBatchDims(%arg0: tensor<2048x32x128xf32>, %arg1: tensor<4x128xf32>) -> tensor<4x65536xf32> { + %36 = "tfl.pseudo_const"() <{value = dense<[2, 0, 1]> : tensor<3xi32>}> : () -> tensor<3xi32> + %37 = "tfl.transpose"(%arg0, %36) : (tensor<2048x32x128xf32>, tensor<3xi32>) -> tensor<128x2048x32xf32> + %38 = "tfl.pseudo_const"() <{value = dense<[128, 65536]> : tensor<2xi32>}> : () -> tensor<2xi32> + %39 = "tfl.reshape"(%37, %38) : (tensor<128x2048x32xf32>, tensor<2xi32>) -> tensor<128x65536xf32> + %41 = "tfl.batch_matmul"(%arg1, %39) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : (tensor<4x128xf32>, tensor<128x65536xf32>) -> tensor<4x65536xf32> + return %41 : tensor<4x65536xf32> + // CHECK-NOT: "tfl.transpose" +} + +// CHECK-LABEL: FuseBatchMatmulToTransposeWithBatchDims +func.func @FuseBatchMatmulToTransposeWithBatchDims(%arg0: tensor<2048x1x8x32x32xf32>, %arg1: tensor<2048x1x2x32xf32>) -> tensor<2048x1x2x256xf32> { + %104 = "tfl.pseudo_const"() <{value = dense<[0, 1, 4, 2, 3]> : tensor<5xi32>}> : () -> tensor<5xi32> + %106 = "tfl.pseudo_const"() <{value = dense<[2048, 1, 32, 256]> : tensor<4xi32>}> : () -> tensor<4xi32> + %202 = "tfl.transpose"(%arg0, %104) : (tensor<2048x1x8x32x32xf32>, tensor<5xi32>) -> tensor<2048x1x32x8x32xf32> + %203 = "tfl.reshape"(%202, %106) : (tensor<2048x1x32x8x32xf32>, tensor<4xi32>) -> tensor<2048x1x32x256xf32> + %204 = "tfl.batch_matmul"(%arg1, %203) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : (tensor<2048x1x2x32xf32>, tensor<2048x1x32x256xf32>) -> tensor<2048x1x2x256xf32> + return %204 : tensor<2048x1x2x256xf32> + // CHECK-NOT: "tfl.transpose" +} + +// CHECK-LABEL: FuseBatchMatmulToTransposeNegative +func.func @FuseBatchMatmulToTransposeNegative(%arg0: tensor<2048x32x1x8x2xf32>, %arg1: tensor<2048x1x32x2xf32>) -> tensor<2048x1x32x256xf32> { + %88 = "tfl.pseudo_const"() <{value = dense<[0, 2, 4, 1, 3]> : tensor<5xi32>}> : () -> tensor<5xi32> + %90 = "tfl.pseudo_const"() <{value = dense<[2048, 1, 2, 256]> : tensor<4xi32>}> : () -> tensor<4xi32> + %194 = "tfl.transpose"(%arg0, %88) : (tensor<2048x32x1x8x2xf32>, tensor<5xi32>) -> tensor<2048x1x2x32x8xf32> + %195 = "tfl.reshape"(%194, %90) : (tensor<2048x1x2x32x8xf32>, tensor<4xi32>) -> tensor<2048x1x2x256xf32> + %196 = "tfl.batch_matmul"(%arg1, %195) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : (tensor<2048x1x32x2xf32>, tensor<2048x1x2x256xf32>) -> tensor<2048x1x32x256xf32> + return %196 : tensor<2048x1x32x256xf32> + // CHECK: "tfl.transpose" +} \ No newline 
at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/optimize_broadcast_like.mlir b/tensorflow/compiler/mlir/lite/tests/optimize_broadcast_like.mlir index 8fae494f23eb..4940eebc701e 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize_broadcast_like.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize_broadcast_like.mlir @@ -1,4 +1,5 @@ -// RUN: tf-opt -tfl-optimize-broadcast-like -split-input-file %s | FileCheck %s +// RUN: tf-opt -tfl-optimize-broadcast-like='unsafe-fuse-dynamic-shaped-broadcast=false' -split-input-file %s | FileCheck %s +// RUN: tf-opt -tfl-optimize-broadcast-like='unsafe-fuse-dynamic-shaped-broadcast=true' -split-input-file %s | FileCheck --check-prefix=UNSAFE-DYNAMIC-CHECK %s // CHECK-LABEL: @broadcast_mul0 func.func @broadcast_mul0(%arg0: tensor<5x7xf32>, %arg1: tensor<7xf32>) -> tensor<5x7xf32> { @@ -19,12 +20,12 @@ func.func @broadcast_mul1(%arg0: tensor<7xf32>, %arg1: tensor<5x7xf32>) -> tenso } // CHECK-LABEL: @broadcast_eq -func.func @broadcast_eq(%arg0: tensor<5x7xf32>, %arg1: tensor<7xf32>) -> tensor<5x7xf32> { +func.func @broadcast_eq(%arg0: tensor<5x7xf32>, %arg1: tensor<7xf32>) -> tensor<5x7xi1> { %cst = mhlo.constant dense<[5, 7]> : tensor<2xi32> %0 = "tfl.broadcast_to"(%arg1, %cst) : (tensor<7xf32>, tensor<2xi32>) -> tensor<5x7xf32> - %1 = "tfl.equal"(%arg0, %0) : (tensor<5x7xf32>, tensor<5x7xf32>) -> tensor<5x7xf32> - func.return %1 : tensor<5x7xf32> - // CHECK: %0 = "tfl.equal"(%arg0, %arg1) : (tensor<5x7xf32>, tensor<7xf32>) -> tensor<5x7xf32> + %1 = "tfl.equal"(%arg0, %0) : (tensor<5x7xf32>, tensor<5x7xf32>) -> tensor<5x7xi1> + func.return %1 : tensor<5x7xi1> + // CHECK: %0 = "tfl.equal"(%arg0, %arg1) : (tensor<5x7xf32>, tensor<7xf32>) -> tensor<5x7xi1> } // CHECK-LABEL: @broadcast_eq_no_fold @@ -665,3 +666,569 @@ func.func @DontFuseMulIntoFullyConnectedForLargeFilter(%arg0: tensor<128x256000x // CHECK: %[[a:.*]] = "tfl.fully_connected"(%arg0, %cst_0, %cst_1) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> // CHECK: %[[b:.*]] = tfl.mul(%[[a]], %cst) <{fused_activation_function = "RELU6"}> } + +// CHECK-LABEL: FuseBroadcastToLhsOfDivIntoRhsOfAdd +func.func @FuseBroadcastToLhsOfDivIntoRhsOfAdd(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.div(%1, %arg1) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = tfl.add(%arg2, %2) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMulIntoRhsOfAdd_quantized +func.func @FuseBroadcastToLhsOfMulIntoRhsOfAdd_quantized(%arg0: tensor<1x1x1x2x1x!quant.uniform>, %arg1: tensor<1x1x1x2x1x!quant.uniform>, %arg2: tensor<1x1x1x2x1x!quant.uniform>) -> tensor<1x1x1x2x64x!quant.uniform> { + %cst = arith.constant dense<[1, 1, 1, 2, 64]> : tensor<5xi64> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor<1x1x1x2x1x!quant.uniform>, tensor<5xi64>) -> tensor<1x1x1x2x64x!quant.uniform> + %2 = tfl.mul(%arg1, %1) <{fused_activation_function = "NONE"}> : (tensor<1x1x1x2x1x!quant.uniform>, tensor<1x1x1x2x64x!quant.uniform>) -> tensor<1x1x1x2x64x!quant.uniform> + %3 = tfl.add(%arg2, %2) {fused_activation_function = "NONE"} : (tensor<1x1x1x2x1x!quant.uniform>, 
tensor<1x1x1x2x64x!quant.uniform>) -> tensor<1x1x1x2x64x!quant.uniform> + return %3 : tensor<1x1x1x2x64x!quant.uniform> + // CHECK: %cst = arith.constant dense<[1, 1, 1, 2, 64]> : tensor<5xi64> + // CHECK: %0 = "tfl.broadcast_to"(%arg0, %cst) : (tensor<1x1x1x2x1x!quant.uniform>, tensor<5xi64>) -> tensor<1x1x1x2x64x!quant.uniform> + // CHECK: %1 = tfl.mul(%arg1, %0) <{fused_activation_function = "NONE"}> : (tensor<1x1x1x2x1x!quant.uniform>, tensor<1x1x1x2x64x!quant.uniform>) -> tensor<1x1x1x2x64x!quant.uniform> + // CHECK: %2 = tfl.add(%arg2, %1) <{fused_activation_function = "NONE"}> : (tensor<1x1x1x2x1x!quant.uniform>, tensor<1x1x1x2x64x!quant.uniform>) -> tensor<1x1x1x2x64x!quant.uniform> + // CHECK: return %2 : tensor<1x1x1x2x64x!quant.uniform> +} + +// CHECK-LABEL: FuseBroadcastToLhsOfDivIntoRhsOfAdd_neg +func.func @FuseBroadcastToLhsOfDivIntoRhsOfAdd_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.div(%1, %arg1) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = tfl.add(%arg2, %2) {fused_activation_function = "NONE"} : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + + +// CHECK-LABEL: FuseBroadcastToLhsOfDivIntoLhsOfAdd +func.func @FuseBroadcastToLhsOfDivIntoLhsOfAdd(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.div(%1, %arg1) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = tfl.add(%2, %arg2) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfDivIntoLhsOfAdd_neg +func.func @FuseBroadcastToLhsOfDivIntoLhsOfAdd_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.div(%1, %arg1) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = tfl.add(%2, %arg2) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfDivIntoRhsOfAdd +func.func @FuseBroadcastToRhsOfDivIntoRhsOfAdd(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.div(%arg1, %1) {fused_activation_function = "NONE"} : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = tfl.add(%arg2, %2) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMulIntoRhsOfAdd_neg +func.func @FuseBroadcastToRhsOfMulIntoRhsOfAdd_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : 
tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%arg1, %1) {fused_activation_function = "NONE"} : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = tfl.add(%arg2, %2) {fused_activation_function = "NONE"} : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + + +// CHECK-LABEL: FuseBroadcastToRhsOfMulIntoLhsOfAdd +func.func @FuseBroadcastToRhsOfMulIntoLhsOfAdd(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%arg1, %1) {fused_activation_function = "NONE"} : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = tfl.add(%2, %arg2) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMulIntoLhsOfAdd_neg +func.func @FuseBroadcastToRhsOfMulIntoLhsOfAdd_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%arg1, %1) {fused_activation_function = "NONE"} : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = tfl.add(%2, %arg2) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMulIntoRhsOfMin +func.func @FuseBroadcastToLhsOfMulIntoRhsOfMin(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%1, %arg1) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = "tfl.minimum"(%arg2, %2) : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMulIntoRhsOfMin_neg +func.func @FuseBroadcastToLhsOfMulIntoRhsOfMin_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%1, %arg1) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = "tfl.minimum"(%arg2, %2) : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMulIntoLhsOfMin +func.func @FuseBroadcastToLhsOfMulIntoLhsOfMin(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%1, %arg1) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = "tfl.minimum"(%2, %arg2) : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: 
FuseBroadcastToLhsOfMulIntoLhsOfMin_neg +func.func @FuseBroadcastToLhsOfMulIntoLhsOfMin_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%1, %arg1) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = "tfl.minimum"(%2, %arg2) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMulIntoRhsOfMin +func.func @FuseBroadcastToRhsOfMulIntoRhsOfMin(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%arg1, %1) {fused_activation_function = "NONE"} : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = "tfl.minimum"(%arg2, %2) : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMulIntoRhsOfMin_neg +func.func @FuseBroadcastToRhsOfMulIntoRhsOfMin_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%arg1, %1) {fused_activation_function = "NONE"} : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = "tfl.minimum"(%arg2, %2) : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMulIntoLhsOfMin +func.func @FuseBroadcastToRhsOfMulIntoLhsOfMin(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%arg1, %1) {fused_activation_function = "NONE"} : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = "tfl.minimum"(%2, %arg2) : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMulIntoLhsOfMin_neg +func.func @FuseBroadcastToRhsOfMulIntoLhsOfMin_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%arg1, %1) {fused_activation_function = "NONE"} : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = "tfl.minimum"(%2, %arg2) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMulIntoRhsOfMinWithActFn +func.func @FuseBroadcastToLhsOfMulIntoRhsOfMinWithActFn(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%1, %arg1) {fused_activation_function = "RELU"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = "tfl.minimum"(%arg2, %2) : (tensor<25x32x1xf32>, 
tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMulIntoRhsOfMinWithActFn_neg +func.func @FuseBroadcastToLhsOfMulIntoRhsOfMinWithActFn_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%1, %arg1) {fused_activation_function = "RELU"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = "tfl.minimum"(%arg2, %2) : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMulIntoLhsOfMinWithActFn +func.func @FuseBroadcastToLhsOfMulIntoLhsOfMinWithActFn(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%1, %arg1) {fused_activation_function = "RELU"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = "tfl.minimum"(%2, %arg2) : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMulIntoLhsOfMinWithActFn_neg +func.func @FuseBroadcastToLhsOfMulIntoLhsOfMinWithActFn_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = tfl.mul(%1, %arg1) {fused_activation_function = "RELU"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = "tfl.minimum"(%2, %arg2) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMinIntoRhsOfMul +func.func @FuseBroadcastToLhsOfMinIntoRhsOfMul(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%1, %arg1) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = tfl.mul(%arg2, %2) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMinIntoRhsOfMul_neg +func.func @FuseBroadcastToLhsOfMinIntoRhsOfMul_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%1, %arg1) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = tfl.mul(%arg2, %2) {fused_activation_function = "NONE"} : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMinIntoLhsOfMul +func.func @FuseBroadcastToLhsOfMinIntoLhsOfMul(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = 
"tfl.minimum"(%1, %arg1) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = tfl.mul(%2, %arg2) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMinIntoLhsOfMul_neg +func.func @FuseBroadcastToLhsOfMinIntoLhsOfMul_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%1, %arg1) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = tfl.mul(%2, %arg2) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMinIntoRhsOfMul +func.func @FuseBroadcastToRhsOfMinIntoRhsOfMul(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%arg1, %1) : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = tfl.mul(%arg2, %2) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMinIntoRhsOfMul_neg +func.func @FuseBroadcastToRhsOfMinIntoRhsOfMul_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%arg1, %1) : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = tfl.mul(%arg2, %2) {fused_activation_function = "NONE"} : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMinIntoLhsOfMul +func.func @FuseBroadcastToRhsOfMinIntoLhsOfMul(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%arg1, %1) : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = tfl.mul(%2, %arg2) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMinIntoLhsOfMul_neg +func.func @FuseBroadcastToRhsOfMinIntoLhsOfMul_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%arg1, %1) : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = tfl.mul(%2, %arg2) {fused_activation_function = "NONE"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMinIntoRhsOfMulWithActFn +func.func @FuseBroadcastToLhsOfMinIntoRhsOfMulWithActFn(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant 
dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%1, %arg1) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = tfl.mul(%arg2, %2) {fused_activation_function = "RELU"} : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMinIntoRhsOfMulWithActFn_neg +func.func @FuseBroadcastToLhsOfMinIntoRhsOfMulWithActFn_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%1, %arg1) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = tfl.mul(%arg2, %2) {fused_activation_function = "RELU"} : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMinIntoLhsOfMulWithActFn +func.func @FuseBroadcastToLhsOfMinIntoLhsOfMulWithActFn(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%1, %arg1) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = tfl.mul(%2, %arg2) {fused_activation_function = "RELU"} : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMinIntoLhsOfMulWithActFn_neg +func.func @FuseBroadcastToLhsOfMinIntoLhsOfMulWithActFn_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%1, %arg1) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = tfl.mul(%2, %arg2) {fused_activation_function = "RELU"} : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMinIntoRhsOfMax +func.func @FuseBroadcastToLhsOfMinIntoRhsOfMax(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%1, %arg1) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = "tfl.maximum"(%arg2, %2) : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMinIntoRhsOfMax_neg +func.func @FuseBroadcastToLhsOfMinIntoRhsOfMax_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%1, %arg1) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = "tfl.maximum"(%arg2, %2) : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMinIntoLhsOfMax +func.func @FuseBroadcastToLhsOfMinIntoLhsOfMax(%arg0: 
tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%1, %arg1) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = "tfl.maximum"(%2, %arg2) : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToLhsOfMinIntoLhsOfMax_neg +func.func @FuseBroadcastToLhsOfMinIntoLhsOfMax_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%1, %arg1) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + %3 = "tfl.maximum"(%2, %arg2) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMinIntoRhsOfMax +func.func @FuseBroadcastToRhsOfMinIntoRhsOfMax(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%arg1, %1) : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = "tfl.maximum"(%arg2, %2) : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMinIntoRhsOfMax_neg +func.func @FuseBroadcastToRhsOfMinIntoRhsOfMax_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%arg1, %1) : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = "tfl.maximum"(%arg2, %2) : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMinIntoLhsOfMax +func.func @FuseBroadcastToRhsOfMinIntoLhsOfMax(%arg0: tensor, %arg1: tensor, %arg2: tensor<25x32x1xf32>) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%arg1, %1) : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = "tfl.maximum"(%2, %arg2) : (tensor<25x32x1xf32>, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: FuseBroadcastToRhsOfMinIntoLhsOfMax_neg +func.func @FuseBroadcastToRhsOfMinIntoLhsOfMax_neg(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<25x32x1xf32> { + %cst = arith.constant dense<[25, 32, 1]> : tensor<3xi32> + %1 = "tfl.broadcast_to"(%arg0, %cst) : (tensor, tensor<3xi32>) -> tensor<25x32x1xf32> + %2 = "tfl.minimum"(%arg1, %1) : (tensor, tensor<25x32x1xf32>) -> tensor<25x32x1xf32> + %3 = "tfl.maximum"(%2, %arg2) : (tensor<25x32x1xf32>, tensor) -> tensor<25x32x1xf32> + return %3 : tensor<25x32x1xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: @broadcast_add_sub +func.func @broadcast_add_sub(%arg0: tensor<5x7xf32>, %arg1: tensor<7xf32>) -> (tensor<5x7xf32>, tensor<5x7xf32>) { + %cst = mhlo.constant dense<[5, 7]> 
: tensor<2xi32> + %0 = "tfl.broadcast_to"(%arg1, %cst) : (tensor<7xf32>, tensor<2xi32>) -> tensor<5x7xf32> + %1 = "tfl.add"(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<5x7xf32>, tensor<5x7xf32>) -> tensor<5x7xf32> + %3 = "tfl.sub"(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<5x7xf32>, tensor<5x7xf32>) -> tensor<5x7xf32> + func.return %1, %3 : tensor<5x7xf32>, tensor<5x7xf32> + // CHECK-NOT: tfl.broadcast_to +} + +// CHECK-LABEL: @broadcast_add_neg +func.func @broadcast_add_neg(%arg0: tensor<2x2xf32>, %arg1: tensor<4x2xf32>, %arg2: tensor) -> (tensor<2x2xf32>, tensor<4x2xf32>) { + %cst = mhlo.constant dense<[2, 2]> : tensor<2xi32> + %cst1 = "tfl.no_value"() {value} : () -> none + %0 = "tfl.broadcast_to"(%arg2, %cst) : (tensor, tensor<2xi32>) -> tensor<2x2xf32> + %1 = "tfl.add"(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + %2 = "tfl.fully_connected"(%arg1, %0, %cst1) {asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<4x2xf32>, tensor<2x2xf32>, none) -> tensor<4x2xf32> + func.return %1, %2 : tensor<2x2xf32>, tensor<4x2xf32> + // CHECK: tfl.broadcast_to +} + +// CHECK-LABEL: @broadcast_abs +func.func @broadcast_abs(%arg0: tensor<1x2xf32>) -> (tensor<2x2xf32>) { + %cst = mhlo.constant dense<[2, 2]> : tensor<2xi32> + %0 = "tfl.broadcast_to"(%arg0, %cst) : (tensor<1x2xf32>, tensor<2xi32>) -> tensor<2x2xf32> + %1 = "tfl.abs"(%0) : (tensor<2x2xf32>) -> tensor<2x2xf32> + func.return %1 : tensor<2x2xf32> + // CHECK: %[[constant:.*]] = mhlo.constant dense<2> : tensor<2xi32> + // CHECK: %[[abs_value:.*]] = "tfl.abs"(%arg0) : (tensor<1x2xf32>) -> tensor<1x2xf32> + // CHECK: %[[broadcasted:.*]] = "tfl.broadcast_to"(%[[abs_value]], %[[constant]]) : (tensor<1x2xf32>, tensor<2xi32>) -> tensor<2x2xf32> + // CHECK: return %[[broadcasted]] +} + +// CHECK-LABEL: @broadcast_cast +func.func @broadcast_cast(%arg0: tensor<1x2xi8>) -> (tensor<2x2xf32>) { + %cst = mhlo.constant dense<[2, 2]> : tensor<2xi32> + %0 = "tfl.broadcast_to"(%arg0, %cst) : (tensor<1x2xi8>, tensor<2xi32>) -> tensor<2x2xi8> + %1 = "tfl.cast"(%0) : (tensor<2x2xi8>) -> tensor<2x2xf32> + func.return %1 : tensor<2x2xf32> + // CHECK: %[[constant:.*]] = mhlo.constant dense<2> : tensor<2xi32> + // CHECK: %[[cast_value:.*]] = "tfl.cast"(%arg0) : (tensor<1x2xi8>) -> tensor<1x2xf32> + // CHECK: %[[broadcasted:.*]] = "tfl.broadcast_to"(%[[cast_value]], %[[constant]]) : (tensor<1x2xf32>, tensor<2xi32>) -> tensor<2x2xf32> + // CHECK: return %[[broadcasted]] +} + +// CHECK-LABEL: @broadcast_dequantize +func.func @broadcast_dequantize(%arg0: tensor<1x2x!quant.uniform>) -> (tensor<2x2xf32>) { + %cst = mhlo.constant dense<[2, 2]> : tensor<2xi32> + %0 = "tfl.broadcast_to"(%arg0, %cst) : (tensor<1x2x!quant.uniform>, tensor<2xi32>) -> tensor<2x2x!quant.uniform> + %1 = "tfl.dequantize"(%0) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> + func.return %1 : tensor<2x2xf32> + // CHECK: %[[constant:.*]] = mhlo.constant dense<2> : tensor<2xi32> + // CHECK: %[[dequantized:.*]] = "tfl.dequantize"(%arg0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> + // CHECK: %[[broadcasted:.*]] = "tfl.broadcast_to"(%[[dequantized]], %[[constant]]) : (tensor<1x2xf32>, tensor<2xi32>) -> tensor<2x2xf32> + // CHECK: return %[[broadcasted]] +} + +// CHECK-LABEL: @broadcast_floor +func.func @broadcast_floor(%arg0: tensor<1x2xf32>) -> (tensor<2x2xf32>) { + %cst = mhlo.constant dense<[2, 2]> : tensor<2xi32> + %0 = 
"tfl.broadcast_to"(%arg0, %cst) : (tensor<1x2xf32>, tensor<2xi32>) -> tensor<2x2xf32> + %1 = "tfl.floor"(%0) : (tensor<2x2xf32>) -> tensor<2x2xf32> + func.return %1 : tensor<2x2xf32> + // CHECK: %[[constant:.*]] = mhlo.constant dense<2> : tensor<2xi32> + // CHECK: %[[floor_value:.*]] = "tfl.floor"(%arg0) : (tensor<1x2xf32>) -> tensor<1x2xf32> + // CHECK: %[[broadcasted:.*]] = "tfl.broadcast_to"(%[[floor_value]], %[[constant]]) : (tensor<1x2xf32>, tensor<2xi32>) -> tensor<2x2xf32> + // CHECK: return %[[broadcasted]] +} + +// CHECK-LABEL: @broadcast_zeros_like +func.func @broadcast_zeros_like(%arg0: tensor<1x2xf32>) -> (tensor<2x2xf32>) { + %cst = mhlo.constant dense<[2, 2]> : tensor<2xi32> + %0 = "tfl.broadcast_to"(%arg0, %cst) : (tensor<1x2xf32>, tensor<2xi32>) -> tensor<2x2xf32> + %1 = "tfl.zeros_like"(%0) : (tensor<2x2xf32>) -> tensor<2x2xf32> + func.return %1 : tensor<2x2xf32> + // CHECK: %[[constant:.*]] = mhlo.constant dense<2> : tensor<2xi32> + // CHECK: %[[zeros:.*]] = "tfl.zeros_like"(%arg0) : (tensor<1x2xf32>) -> tensor<1x2xf32> + // CHECK: %[[broadcasted:.*]] = "tfl.broadcast_to"(%[[zeros]], %[[constant]]) : (tensor<1x2xf32>, tensor<2xi32>) -> tensor<2x2xf32> + // CHECK: return %[[broadcasted]] +} + +// CHECK-LABEL: @broadcast_mul_dynamic_rhs +func.func @broadcast_mul_dynamic_rhs(%arg0: tensor, %arg1: tensor<1x7xf32>) -> tensor { + %shape = "tfl.shape"(%arg0) : (tensor) -> tensor<2xi32> + %0 = "tfl.broadcast_to"(%arg1, %shape) : (tensor<1x7xf32>, tensor<2xi32>) -> tensor + %1 = "tfl.mul"(%arg0, %0) {fused_activation_function = "NONE"} : (tensor, tensor) -> tensor + func.return %1 : tensor + // UNSAFE-DYNAMIC-CHECK: %0 = tfl.mul(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor, tensor<1x7xf32>) -> tensor +} + +// CHECK-LABEL: @broadcast_mul_dynamic_rhs2 +func.func @broadcast_mul_dynamic_rhs2(%arg0: tensor, %arg1: tensor<7xf32>) -> tensor { + %shape = "tfl.shape"(%arg0) : (tensor) -> tensor<2xi32> + %0 = "tfl.broadcast_to"(%arg1, %shape) : (tensor<7xf32>, tensor<2xi32>) -> tensor + %1 = "tfl.mul"(%arg0, %0) {fused_activation_function = "NONE"} : (tensor, tensor) -> tensor + func.return %1 : tensor + // UNSAFE-DYNAMIC-CHECK: %0 = tfl.mul(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor, tensor<7xf32>) -> tensor +} + +// CHECK-LABEL: @broadcast_mul_dynamic_lhs +func.func @broadcast_mul_dynamic_lhs(%arg0: tensor<1x7xf32>, %arg1: tensor) -> tensor { + %shape = "tfl.shape"(%arg1) : (tensor) -> tensor<2xi32> + %0 = "tfl.broadcast_to"(%arg0, %shape) : (tensor<1x7xf32>, tensor<2xi32>) -> tensor + %1 = "tfl.mul"(%0, %arg1) {fused_activation_function = "NONE"} : (tensor, tensor) -> tensor + func.return %1 : tensor + // UNSAFE-DYNAMIC-CHECK: %0 = tfl.mul(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<1x7xf32>, tensor) -> tensor +} + +// CHECK-LABEL: @move_broadcast_through_sum +func.func @move_broadcast_through_sum(%arg0: tensor<1x1x40x100x40x3xf32>) -> tensor<1x4x100x40x3xf32> { + %cst_0 = arith.constant dense<[1, 4, 40, 100, 40, 3]> : tensor<6xi64> + %cst_1 = arith.constant dense<2> : tensor<1xi32> + %0 = "tfl.broadcast_to"(%arg0, %cst_0) : (tensor<1x1x40x100x40x3xf32>, tensor<6xi64>) -> tensor<1x4x40x100x40x3xf32> + %1 = "tfl.sum"(%0, %cst_1) <{keep_dims = false}> : (tensor<1x4x40x100x40x3xf32>, tensor<1xi32>) -> tensor<1x4x100x40x3xf32> + return %1 : tensor<1x4x100x40x3xf32> + // CHECK: %cst = arith.constant dense<[1, 4, 100, 40, 3]> : tensor<5xi32> + // CHECK: %cst_0 = arith.constant dense<2> : tensor<1xi32> + // CHECK: %0 = "tfl.sum"(%arg0, 
%cst_0) <{keep_dims = false}> : (tensor<1x1x40x100x40x3xf32>, tensor<1xi32>) -> tensor<1x1x100x40x3xf32> + // CHECK: %1 = "tfl.broadcast_to"(%0, %cst) : (tensor<1x1x100x40x3xf32>, tensor<5xi32>) -> tensor<1x4x100x40x3xf32> + // CHECK: return %1 : tensor<1x4x100x40x3xf32> +} + +// CHECK-LABEL: @move_broadcast_through_sum_keep_dims +func.func @move_broadcast_through_sum_keep_dims(%arg0: tensor<1x1x40x100x40x3xf32>) -> tensor<1x4x1x100x40x3xf32> { + %cst_0 = arith.constant dense<[1, 4, 40, 100, 40, 3]> : tensor<6xi64> + %cst_1 = arith.constant dense<2> : tensor<1xi32> + %0 = "tfl.broadcast_to"(%arg0, %cst_0) : (tensor<1x1x40x100x40x3xf32>, tensor<6xi64>) -> tensor<1x4x40x100x40x3xf32> + %1 = "tfl.sum"(%0, %cst_1) <{keep_dims = true}> : (tensor<1x4x40x100x40x3xf32>, tensor<1xi32>) -> tensor<1x4x1x100x40x3xf32> + return %1 : tensor<1x4x1x100x40x3xf32> + // CHECK: %cst = arith.constant dense<[1, 4, 1, 100, 40, 3]> : tensor<6xi32> + // CHECK: %cst_0 = arith.constant dense<2> : tensor<1xi32> + // CHECK: %0 = "tfl.sum"(%arg0, %cst_0) <{keep_dims = true}> : (tensor<1x1x40x100x40x3xf32>, tensor<1xi32>) -> tensor<1x1x1x100x40x3xf32> + // CHECK: %1 = "tfl.broadcast_to"(%0, %cst) : (tensor<1x1x1x100x40x3xf32>, tensor<6xi32>) -> tensor<1x4x1x100x40x3xf32> + // CHECK: return %1 : tensor<1x4x1x100x40x3xf32> +} + +// CHECK-LABEL: @move_broadcast_through_sum_neg +func.func @move_broadcast_through_sum_neg(%arg0: tensor<1x1x40x100x40x3xf32>) -> tensor<1x40x100x40x3xf32> { + %cst_0 = arith.constant dense<[1, 4, 40, 100, 40, 3]> : tensor<6xi64> + %cst_1 = arith.constant dense<1> : tensor<1xi32> + %0 = "tfl.broadcast_to"(%arg0, %cst_0) : (tensor<1x1x40x100x40x3xf32>, tensor<6xi64>) -> tensor<1x4x40x100x40x3xf32> + %1 = "tfl.sum"(%0, %cst_1) <{keep_dims = false}> : (tensor<1x4x40x100x40x3xf32>, tensor<1xi32>) -> tensor<1x40x100x40x3xf32> + return %1 : tensor<1x40x100x40x3xf32> + // CHECK: %cst = arith.constant dense<[1, 4, 40, 100, 40, 3]> : tensor<6xi64> + // CHECK: %cst_0 = arith.constant dense<1> : tensor<1xi32> + // CHECK: %0 = "tfl.broadcast_to"(%arg0, %cst) : (tensor<1x1x40x100x40x3xf32>, tensor<6xi64>) -> tensor<1x4x40x100x40x3xf32> + // CHECK: %1 = "tfl.sum"(%0, %cst_0) <{keep_dims = false}> : (tensor<1x4x40x100x40x3xf32>, tensor<1xi32>) -> tensor<1x40x100x40x3xf32> + // CHECK: return %1 : tensor<1x40x100x40x3xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir index 005aec23403c..8971ca0d6d37 100644 --- a/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir @@ -188,9 +188,21 @@ func.func @FoldPerAxisReshape() -> tensor<1x2x2x!quant.uniform>, value = dense<[[-127, 127], [-85, -80]]> : tensor<2x2xi8>}> : () -> tensor<2x2x!quant.uniform> %1 = "tfl.reshape"(%0, %cst) : (tensor<2x2x!quant.uniform>, tensor<3xi32>) -> tensor<1x2x2x!quant.uniform> return %1 : tensor<1x2x2x!quant.uniform> - + // CHECK{LITERAL}: %0 = "tfl.pseudo_qconst"() <{qtype = tensor<1x2x2x!quant.uniform>, value = dense<[[[-127, 127], [-85, -80]]]> : tensor<1x2x2xi8>}> : () -> tensor<1x2x2x!quant.uniform> // CHECK-NOT: tfl.reshape // CHECK: return %0 : tensor<1x2x2x!quant.uniform> } + +// CHECK-LABEL: RemoveVolatileQConstOps +func.func @RemoveVolatileQConstOps() -> tensor<640xf32> { + %1 = "tfl.pseudo_qconst"() <{qtype = tensor<640x!quant.uniform>, value = dense<0> : tensor<640xi32>}> {volatile} : () -> tensor<640x!quant.uniform> + %2 = "tfl.dequantize"(%1) : (tensor<640x!quant.uniform>) -> 
tensor<640xf32> + func.return %2 : tensor<640xf32> + // CHECK: %0 = "tfl.pseudo_qconst"() <{qtype = tensor<640x!quant.uniform>, value = dense<0> : tensor<640xi32>}> {volatile} : () -> tensor<640x!quant.uniform> + // CHECK: return %0 : tensor<640x!quant.uniform> + + // QDQ-CHECK: %cst = arith.constant dense<0.000000e+00> : tensor<640xf32> + // QDQ-CHECK: return %cst : tensor<640xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir index c6a2eb88e09e..c2ba52bf0f5a 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir @@ -298,3 +298,57 @@ func.func @bias_adjust_pass_immutable(%arg0: tensor<1x2xf32>) -> (tensor<1x2xf32 // CHECK: %[[w_q:.*]] = "tfl.quantize"(%[[weight]]) // CHECK-SAME: quant.uniform } + +// ----- + +// Series of values needing requantization -- first the args then the results +// of concatenation operations. concat(concat(arg2, arg0), concat(arg1, arg0)), +// concat(concat(arg2, arg0), arg3)). arg0 should be requantized twice -- +// concat(arg2, arg0) should be requantized twice as well. +// Int8-LABEL: QuantizedCatsAddRequantsTest +func.func @QuantizedCatsAddRequantsTest(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1xf32>, %arg2: tensor<1x1xf32>, %arg3: tensor<1x1xf32>) -> (tensor<1x4xf32>, tensor<1x3xf32>) { + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[-0.440728068, 0.189515018]> : tensor<2xf32>} : (tensor<1x1xf32>) -> tensor<1x1xf32> + %1 = "quantfork.stats"(%arg1) {layerStats = dense<[-0.154693216, 0.26483655]> : tensor<2xf32>} : (tensor<1x1xf32>) -> tensor<1x1xf32> + %2 = "quantfork.stats"(%arg2) {layerStats = dense<[-0.488159984, 0.16362021]> : tensor<2xf32>} : (tensor<1x1xf32>) -> tensor<1x1xf32> + %3 = "quantfork.stats"(%arg3) {layerStats = dense<[-0.25180456, 0.398609281]> : tensor<2xf32>} : (tensor<1x1xf32>) -> tensor<1x1xf32> + %6 = "tfl.concatenation"(%1, %0) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x2xf32> + %7 = "quantfork.stats"(%6) {layerStats = dense<[-0.440728068, 0.26483655]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %8 = "tfl.concatenation"(%2, %0) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x2xf32> + %9 = "quantfork.stats"(%8) {layerStats = dense<[-0.488159984, 0.189515018]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %10 = "tfl.concatenation"(%9, %7) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x4xf32> + %11 = "quantfork.stats"(%10) {layerStats = dense<[-0.488159984, 0.26483655]> : tensor<2xf32>} : (tensor<1x4xf32>) -> tensor<1x4xf32> + %13 = "tfl.concatenation"(%9, %3) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x1xf32>) -> tensor<1x3xf32> + %14 = "quantfork.stats"(%13) {layerStats = dense<[-0.488159984, 0.398609281]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + func.return %10, %14 : tensor<1x4xf32>, tensor<1x3xf32> + +// Int8: %[[q0:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform> +// Int8-NEXT: %[[r0q0:.*]] = "tfl.quantize"(%[[q0]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform> +// Int8-NEXT: %[[r1q0:.*]] = "tfl.quantize"(%[[q0]]) <{qtype = tensor<1x1x!quant.uniform>}> : 
(tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform> +// Int8-NEXT: %[[d1q0:.*]] = "tfl.dequantize"(%[[r1q0]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32> +// Int8-NEXT: %[[d0q0:.*]] = "tfl.dequantize"(%[[r0q0]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32> +// Int8-NEXT: %[[q1:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform> +// Int8-NEXT: %[[r0q1:.*]] = "tfl.quantize"(%[[q1]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform> +// Int8-NEXT: %[[d0q1:.*]] = "tfl.dequantize"(%[[r0q1]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32> +// Int8-NEXT: %[[q2:.*]] = "tfl.quantize"(%arg2) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform> +// Int8-NEXT: %[[r0q2:.*]] = "tfl.quantize"(%[[q2]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform> +// Int8-NEXT: %[[d0q2:.*]] = "tfl.dequantize"(%[[r0q2]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32> +// Int8-NEXT: %[[q3:.*]] = "tfl.quantize"(%arg3) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform> +// Int8-NEXT: %[[r0q3:.*]] = "tfl.quantize"(%[[q3]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform> +// Int8-NEXT: %[[d0q3:.*]] = "tfl.dequantize"(%[[r0q3]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32> +// Int8-NEXT: %[[cat1_0:.*]] = "tfl.concatenation"(%[[d0q1]], %[[d1q0]]) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x2xf32> +// Int8-NEXT: %[[qcat1_0:.*]] = "tfl.quantize"(%[[cat1_0]]) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// Int8-NEXT: %[[r0qcat1_0:.*]] = "tfl.quantize"(%[[qcat1_0]]) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> +// Int8-NEXT: %[[d0qcat1_0:.*]] = "tfl.dequantize"(%[[r0qcat1_0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> +// Int8-NEXT: %[[cat_2_0:.*]] = "tfl.concatenation"(%[[d0q2]], %[[d0q0]]) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x2xf32> +// Int8-NEXT: %[[qcat_2_0:.*]] = "tfl.quantize"(%[[cat_2_0]]) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// Int8-NEXT: %[[r0qcat_2_0:.*]] = "tfl.quantize"(%[[qcat_2_0]]) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> +// Int8-NEXT: %[[d0qcat_2_0:.*]] = "tfl.dequantize"(%[[r0qcat_2_0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> +// Int8-NEXT: %[[dqcat_2_0:.*]] = "tfl.dequantize"(%[[qcat_2_0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> +// Int8-NEXT: %[[cat_2_0_1_0:.*]] = "tfl.concatenation"(%[[dqcat_2_0]], %[[d0qcat1_0]]) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x4xf32> +// Int8-NEXT: %[[qcat_2_0_1_0:.*]] = "tfl.quantize"(%[[cat_2_0_1_0]]) <{qtype = tensor<1x4x!quant.uniform>}> {volatile} : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> +// Int8-NEXT: %[[dqcat_2_0_1_0:.*]] = "tfl.dequantize"(%[[qcat_2_0_1_0]]) : (tensor<1x4x!quant.uniform>) -> tensor<1x4xf32> +// Int8-NEXT: %[[cat_2_0_3:.*]] = "tfl.concatenation"(%[[d0qcat_2_0]], %[[d0q3]]) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, 
tensor<1x1xf32>) -> tensor<1x3xf32> +// Int8-NEXT: %[[qcat_2_0_3:.*]] = "tfl.quantize"(%[[cat_2_0_3]]) <{qtype = tensor<1x3x!quant.uniform>}> {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> +// Int8-NEXT: %[[dqcat_2_0_3:.*]] = "tfl.dequantize"(%[[qcat_2_0_3]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> +// Int8-NEXT: return %[[dqcat_2_0_1_0]], %[[dqcat_2_0_3]] : tensor<1x4xf32>, tensor<1x3xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir index 1fb4381e90af..4ce9be1aa3d2 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir @@ -1,6 +1,8 @@ -// RUN: tf-opt %s -tfl-prepare-quantize="quantize-allowlist=quantize_float_placeholder_only,not_reset_input" | FileCheck %s -// RUN: tf-opt %s -tfl-prepare-quantize="disable-set-input-nodes-quantization-params=true" | FileCheck --check-prefix=MixedPrecision %s -// RUN: tf-opt %s -tfl-prepare-quantize="qdq-conversion-mode=Static" | FileCheck --check-prefix=QDQ %s +// RUN: tf-opt %s -split-input-file -tfl-prepare-quantize="quantize-allowlist=quantize_float_placeholder_only,not_reset_input" | FileCheck %s +// RUN: tf-opt %s -split-input-file -tfl-prepare-quantize="disable-set-input-nodes-quantization-params=true" | FileCheck --check-prefix=MixedPrecision %s +// RUN: tf-opt %s -split-input-file -tfl-prepare-quantize="qdq-conversion-mode=Static" | FileCheck --check-prefix=QDQ %s + +// ----- // CHECK-LABEL: main // Uses `main` function to match the default target function of QuantSpecs and @@ -23,8 +25,10 @@ func.func @main(%arg0: tensor<2x1xf32>, %arg1: tensor<2x3xf32>) -> (tensor<2x4xf // CHECK-NEXT: return %[[dq_1:.*]] } -// MixedPrecision-LABEL: paritial_quantized -func.func @paritial_quantized(%arg0: tensor<2x1xf32>, %arg1: tensor<2x3xf32>, %arg2: tensor<2x4xf32>) -> (tensor<2x4xf32>) { +// ----- + +// MixedPrecision-LABEL: partial_quantized +func.func @partial_quantized(%arg0: tensor<2x1xf32>, %arg1: tensor<2x3xf32>, %arg2: tensor<2x4xf32>) -> (tensor<2x4xf32>) { %0 = "tfl.quantize"(%arg0) {qtype = tensor<2x1x!quant.uniform>} : (tensor<2x1xf32>) -> tensor<2x1x!quant.uniform> %1 = "tfl.dequantize"(%0) : (tensor<2x1x!quant.uniform>) -> (tensor<2x1xf32>) %2 = "tfl.quantize"(%arg1) {qtype = tensor<2x3x!quant.uniform>} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform> @@ -44,6 +48,8 @@ func.func @paritial_quantized(%arg0: tensor<2x1xf32>, %arg1: tensor<2x3xf32>, %a // MixedPrecision-NEXT: return %[[v:.*]] } +// ----- + // CHECK-LABEL: quantize_float_placeholder_only func.func @quantize_float_placeholder_only(%arg0: tensor, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xf32>) -> (tensor, tensor<2x3xi32>, tensor<2x3xf32>) { func.return %arg0, %arg1, %arg2: tensor, tensor<2x3xi32>, tensor<2x3xf32> @@ -55,6 +61,8 @@ func.func @quantize_float_placeholder_only(%arg0: tensor, %arg1: tensor<2x3 // CHECK-NEXT: %[[dq]], %arg1, %[[dq_0]] } +// ----- + // CHECK-LABEL: not_reset_input func.func @not_reset_input(%arg0: tensor) -> (tensor>) { %0 = "tfl.quantize"(%arg0) {qtype = tensor>} : (tensor) -> tensor> @@ -64,6 +72,8 @@ func.func @not_reset_input(%arg0: tensor) -> (tensor tensor<2x2x!quant.uniform> { %cst = "tfl.pseudo_qconst"() {qtype = tensor<2x2x!quant.uniform>, value = dense<-1> : tensor<2x2xi8>} : () -> tensor<2x2x!quant.uniform> @@ -77,6 +87,8 @@ func.func @DequantizeAndQuantize() -> tensor<2x2x!quant.uniform) -> tensor<8x4x3xf32> { %0 = "quantfork.stats"(%arg0) { @@ -99,6 +111,8 
@@ func.func @prepareStatistics(%arg0: tensor<8x4x3xf32>) -> tensor<8x4x3xf32> { // CHECK: return %[[dq2]] } +// ----- + // CHECK-LABEL: prepareNarrowStatistics func.func @prepareNarrowStatistics(%arg0: tensor<8x4x3xf32>) -> tensor<8x4x3xf32> { %0 = "quantfork.stats"(%arg0) { @@ -111,6 +125,8 @@ func.func @prepareNarrowStatistics(%arg0: tensor<8x4x3xf32>) -> tensor<8x4x3xf32 // CHECK: return %[[dq]] } +// ----- + // CHECK-LABEL: QuantizeConv2DPerChannel func.func @QuantizeConv2DPerChannel(%arg0: tensor<1x224x224x3x!quant.uniform>, %arg1: tensor<32x3x3x3x!quant.uniform:f32:3, {1.0,2.0,3.0}>>) -> tensor<1x112x112x32xf32> { @@ -131,6 +147,8 @@ func.func @QuantizeConv2DPerChannel(%arg0: tensor<1x224x224x3x!quant.uniform>, %arg1: tensor<32x3x3x3x!quant.uniform:f32:3, {1.0,2.0,3.0}>>) -> tensor<1x112x112x32xf32> { @@ -151,6 +169,8 @@ func.func @QuantizeConv2DPerChannelConst(%arg0: tensor<1x224x224x3x!quant.unifor // CHECK-NEXT: return %[[conv]] } +// ----- + // CHECK-LABEL: QuantizeConv2DPerChannels func.func @QuantizeConv2DPerChannels(%arg0: tensor<1x224x224x3x!quant.uniform>, %arg1: tensor<32x3x3x3x!quant.uniform:f32:3, {1.0,2.0,3.0}>>) -> tensor<1x112x112x32xf32> { @@ -171,6 +191,8 @@ func.func @QuantizeConv2DPerChannels(%arg0: tensor<1x224x224x3x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> { ^bb0(%arg0: tensor<1x224x224x3x!quant.uniform>): @@ -193,6 +215,8 @@ func.func @QuantizeConv2D(tensor<1x224x224x3x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> { ^bb0(%arg0: tensor<1x224x224x3x!quant.uniform>): @@ -215,6 +239,8 @@ func.func @QuantizeFullyConnected(tensor<1x224x224x3x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> { ^bb0(%arg0: tensor<1x224x224x3x!quant.uniform>): @@ -237,6 +263,8 @@ func.func @QuantizeDepthwiseConv2D(tensor<1x224x224x3x!quant.uniform>) -> tensor<1x1x1x16xf32> { ^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): @@ -253,6 +281,8 @@ func.func @QuantizeAveragePool2D(tensor<1x6x6x16x!quant.uniform } +// ----- + // CHECK-LABEL: QuantizeMaximum func.func @QuantizeMaximum(tensor<1x6x6x16x!quant.uniform>, tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { ^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>, %arg1: tensor<1x6x6x16x!quant.uniform>): @@ -269,6 +299,8 @@ func.func @QuantizeMaximum(tensor<1x6x6x16x!quant.uniform>, tensor< // CHECK: return %4 : tensor<1x6x6x16xf32> } +// ----- + // CHECK-LABEL: QuantizeMinimum func.func @QuantizeMinimum(tensor<1x6x6x16x!quant.uniform>, tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { ^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>, %arg1: tensor<1x6x6x16x!quant.uniform>): @@ -285,6 +317,8 @@ func.func @QuantizeMinimum(tensor<1x6x6x16x!quant.uniform>, tensor< // CHECK: return %4 : tensor<1x6x6x16xf32> } +// ----- + // CHECK-LABEL: QuantizeSlice func.func @QuantizeSlice(tensor<2x3x5x!quant.uniform>, tensor<3xi32>, tensor<3xi32>) -> tensor { ^bb0(%arg0: tensor<2x3x5x!quant.uniform>, %arg1: tensor<3xi32>, %arg2: tensor<3xi32>): @@ -299,6 +333,8 @@ func.func @QuantizeSlice(tensor<2x3x5x!quant.uniform>, tensor<3xi32 // CHECK: return %3 : tensor } +// ----- + // CHECK-LABEL: QuantizeStridedSlice func.func @QuantizeStridedSlice(tensor<12x2x2x5x!quant.uniform>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> { ^bb0(%arg0: tensor<12x2x2x5x!quant.uniform>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>): @@ -313,6 +349,8 @@ func.func @QuantizeStridedSlice(tensor<12x2x2x5x!quant.uniform>, te // CHECK: return %3 : tensor<1x2x2x5xf32> } +// ----- + // CHECK-LABEL: QuantizePad 
func.func @QuantizePad(tensor<2x1x3x!quant.uniform>, tensor<3x2xi32>) -> tensor { ^bb0(%arg0: tensor<2x1x3x!quant.uniform>, %arg1: tensor<3x2xi32>): @@ -327,6 +365,8 @@ func.func @QuantizePad(tensor<2x1x3x!quant.uniform>, tensor<3x2xi32 // CHECK: return %3 : tensor } +// ----- + // CHECK-LABEL: QuantizePad2 // only the second tfl.pad has sufficient quantization information. func.func @QuantizePad2(tensor<2x1x3x!quant.uniform>, tensor<2x1x3xf32>, tensor<3x2xi32>) -> (tensor, tensor) { @@ -343,6 +383,8 @@ func.func @QuantizePad2(tensor<2x1x3x!quant.uniform>, tensor<2x1x3x // CHECK: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) } +// ----- + // CHECK-LABEL: QuantizeReshape2D func.func @QuantizeReshape2D(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x36x16xf32> { ^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): @@ -358,6 +400,8 @@ func.func @QuantizeReshape2D(tensor<1x6x6x16x!quant.uniform } +// ----- + // CHECK-LABEL: QuantizeSoftmax func.func @QuantizeSoftmax(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { ^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): @@ -372,6 +416,8 @@ func.func @QuantizeSoftmax(tensor<1x6x6x16x!quant.uniform } +// ----- + // CHECK-LABEL: QuantizeLogistic func.func @QuantizeLogistic(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { ^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): @@ -386,6 +432,8 @@ func.func @QuantizeLogistic(tensor<1x6x6x16x!quant.uniform } +// ----- + // CHECK-LABEL: NotRescaleLogistic func.func @NotRescaleLogistic(%arg0: tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16x!quant.uniform> { %0 = "tfl.logistic"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16x!quant.uniform> @@ -395,6 +443,8 @@ func.func @NotRescaleLogistic(%arg0: tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { ^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): @@ -408,6 +458,8 @@ func.func @QDQNoQuantizeLogistic(tensor<1x6x6x16x!quant.uniform } +// ----- + // QDQ-LABEL: QDQNoQuantizeSoftmax func.func @QDQNoQuantizeSoftmax(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { ^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): @@ -421,6 +473,8 @@ func.func @QDQNoQuantizeSoftmax(tensor<1x6x6x16x!quant.uniform } +// ----- + // CHECK-LABEL: QuantizeL2Norm func.func @QuantizeL2Norm(%arg0: tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { %0 = "tfl.dequantize"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> @@ -434,6 +488,8 @@ func.func @QuantizeL2Norm(%arg0: tensor<1x6x6x16x!quant.uniform>) - // CHECK: return %[[dq]] : tensor<1x6x6x16xf32> } +// ----- + // CHECK-LABEL: NotQuantizeConcatConstantOperand func.func @NotQuantizeConcatConstantOperand(%arg0: tensor<1x2xf32>) -> tensor<2x2xf32> { %0 = arith.constant dense<1.0> : tensor<1x2xf32> @@ -445,6 +501,8 @@ func.func @NotQuantizeConcatConstantOperand(%arg0: tensor<1x2xf32>) -> tensor<2x // CHECK-NEXT: return %[[cc]] } +// ----- + // CHECK-LABEL: QuantizeConcatOperand0ToAll func.func @QuantizeConcatOperand0ToAll(tensor<1x2x!quant.uniform>, tensor<1x2xf32>) -> tensor<2x2xf32> { ^bb0(%arg0: tensor<1x2x!quant.uniform>, %arg1: tensor<1x2xf32>): @@ -461,6 +519,8 @@ func.func @QuantizeConcatOperand0ToAll(tensor<1x2x!quant.uniform } +// ----- + // CHECK-LABEL: QuantizeConcatOperand1ToAll func.func @QuantizeConcatOperand1ToAll(tensor<1x2xf32>, tensor<1x2x!quant.uniform>) -> tensor<2x2xf32> { ^bb0(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2x!quant.uniform>): @@ -477,6 +537,8 @@ func.func @QuantizeConcatOperand1ToAll(tensor<1x2xf32>, tensor<1x2x!quant.unifor // CHECK: return %5 : tensor<2x2xf32> } +// 
----- + // CHECK-LABEL: QuantizeConcatResToAll func.func @QuantizeConcatResToAll(tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2x!quant.uniform> { ^bb0(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>): @@ -493,6 +555,8 @@ func.func @QuantizeConcatResToAll(tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x // CHECK: return %5 : tensor<2x2x!quant.uniform> } +// ----- + // CHECK-LABEL: QuantizeConcatResToAllNoRequantize func.func @QuantizeConcatResToAllNoRequantize(tensor<1x2x!quant.uniform>, tensor<1x2xf32>) -> tensor<2x2x!quant.uniform> { ^bb0(%arg0: tensor<1x2x!quant.uniform>, %arg1: tensor<1x2xf32>): @@ -509,42 +573,48 @@ func.func @QuantizeConcatResToAllNoRequantize(tensor<1x2x!quant.uniform> } +// ----- + // CHECK-LABEL: QuantizeConcatResToAllRequantize -func.func @QuantizeConcatResToAllRequantize(tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2x!quant.uniform> { +func.func @QuantizeConcatResToAllRequantize(tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2x!quant.uniform> { ^bb0(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>): - %0 = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> - %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> + %0 = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> %2 = "tfl.concatenation"(%1, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> - %3 = "tfl.quantize"(%2) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> - func.return %3 : tensor<2x2x!quant.uniform> - -// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} -// CHECK: %[[DQ1:.*]] = "tfl.dequantize"(%[[Q1]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> -// CHECK: %[[Q0:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> -// CHECK: %[[RQ0:.*]] = "tfl.quantize"(%[[Q0]]) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> -// CHECK: %[[DQ0:.*]] = "tfl.dequantize"(%[[RQ0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> + %3 = "tfl.quantize"(%2) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> + func.return %3 : tensor<2x2x!quant.uniform> + +// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} +// CHECK: %[[DQ1:.*]] = "tfl.dequantize"(%[[Q1]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> +// CHECK: %[[Q0:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[RQ0:.*]] = "tfl.quantize"(%[[Q0]]) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[DQ0:.*]] = "tfl.dequantize"(%[[RQ0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %[[CONC:.*]] = "tfl.concatenation"(%[[DQ0]], %[[DQ1]]) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> -// CHECK: %[[Q:.*]] = "tfl.quantize"(%[[CONC]]) <{qtype = tensor<2x2x!quant.uniform>}> : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> -// CHECK: return %[[Q]] : tensor<2x2x!quant.uniform> +// CHECK: %[[Q:.*]] = "tfl.quantize"(%[[CONC]]) <{qtype = tensor<2x2x!quant.uniform>}> : (tensor<2x2xf32>) -> 
tensor<2x2x!quant.uniform> +// CHECK: return %[[Q]] : tensor<2x2x!quant.uniform> } +// ----- + // CHECK-LABEL: QuantizeConcatResToAllRequantizeArg -func.func @QuantizeConcatResToAllRequantizeArg(tensor<1x2x!quant.uniform>, tensor<1x2xf32>) -> tensor<2x2x!quant.uniform> { -^bb0(%arg0: tensor<1x2x!quant.uniform>, %arg1: tensor<1x2xf32>): - %1 = "tfl.dequantize"(%arg0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> +func.func @QuantizeConcatResToAllRequantizeArg(tensor<1x2x!quant.uniform>, tensor<1x2xf32>) -> tensor<2x2x!quant.uniform> { +^bb0(%arg0: tensor<1x2x!quant.uniform>, %arg1: tensor<1x2xf32>): + %1 = "tfl.dequantize"(%arg0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> %2 = "tfl.concatenation"(%1, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> - %3 = "tfl.quantize"(%2) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> - func.return %3 : tensor<2x2x!quant.uniform> + %3 = "tfl.quantize"(%2) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> + func.return %3 : tensor<2x2x!quant.uniform> -// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} -// CHECK: %[[DQ1:.*]] = "tfl.dequantize"(%[[Q1]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> -// CHECK: %[[RQ0:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> -// CHECK: %[[DQ0:.*]] = "tfl.dequantize"(%[[RQ0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> +// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} +// CHECK: %[[DQ1:.*]] = "tfl.dequantize"(%[[Q1]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> +// CHECK: %[[RQ0:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[DQ0:.*]] = "tfl.dequantize"(%[[RQ0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %[[CONC:.*]] = "tfl.concatenation"(%[[DQ0]], %[[DQ1]]) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> -// CHECK: %[[Q:.*]] = "tfl.quantize"(%[[CONC]]) <{qtype = tensor<2x2x!quant.uniform>}> : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> -// CHECK: return %[[Q]] : tensor<2x2x!quant.uniform> +// CHECK: %[[Q:.*]] = "tfl.quantize"(%[[CONC]]) <{qtype = tensor<2x2x!quant.uniform>}> : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> +// CHECK: return %[[Q]] : tensor<2x2x!quant.uniform> } +// ----- + // CHECK-LABEL: NotRequantizeAlreadyQuantizedModel func.func @NotRequantizeAlreadyQuantizedModel(%arg0: tensor<1x73x73x64x!quant.uniform>, %arg1: tensor<1x147x147x96x!quant.uniform>) -> tensor<1x73x73x160x!quant.uniform> { %9 = "tfl.max_pool_2d"(%arg1) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x147x147x96x!quant.uniform>) -> tensor<1x73x73x96x!quant.uniform> @@ -556,6 +626,8 @@ func.func @NotRequantizeAlreadyQuantizedModel(%arg0: tensor<1x73x73x64x!quant.un // CHECK: return %[[cat]] : tensor<1x73x73x160x!quant.uniform> } +// ----- + // CHECK-LABEL: QuantizeChain func.func @QuantizeChain(tensor<1x224x224x3x!quant.uniform>) -> tensor<1x36x16xf32> { ^bb0(%arg0: tensor<1x224x224x3x!quant.uniform>): @@ -597,6 +669,8 @@ func.func @QuantizeChain(tensor<1x224x224x3x!quant.uniform } +// ----- + // CHECK-LABEL: 
QuantizeConstant func.func @QuantizeConstant() -> tensor<2x3xf32> { %cst = arith.constant dense<[[-3.0, -1.0, 0.0], [0.0, 1.0, 3.0]]> : tensor<2x3xf32> @@ -608,6 +682,8 @@ func.func @QuantizeConstant() -> tensor<2x3xf32> { // CHECK: return %1 : tensor<2x3xf32> } +// ----- + // CHECK-LABEL: NotQuantizeNoneType func.func @NotQuantizeNoneType() -> none { %cst = "tfl.no_value"() {value = unit} : () -> none @@ -617,6 +693,8 @@ func.func @NotQuantizeNoneType() -> none { // CHECK-NEXT: return %[[cst]] } +// ----- + // CHECK-LABEL: QuantizeZeroSplat func.func @QuantizeZeroSplat() -> tensor<2x3xf32> { %cst = arith.constant dense<0.0> : tensor<2x3xf32> @@ -626,6 +704,8 @@ func.func @QuantizeZeroSplat() -> tensor<2x3xf32> { // CHECK-NEXT: "tfl.quantize"(%[[cst]]) <{qtype = tensor<2x3x!quant.uniform>}> {volatile} } +// ----- + // CHECK-LABEL: QuantizeZeroScalar func.func @QuantizeZeroScalar() -> tensor { %cst = arith.constant dense<0.0> : tensor @@ -635,6 +715,8 @@ func.func @QuantizeZeroScalar() -> tensor { // CHECK-NEXT: "tfl.quantize"(%[[cst]]) <{qtype = tensor>}> {volatile} } +// ----- + // CHECK-LABEL: QuantizePositiveSplat func.func @QuantizePositiveSplat() -> tensor<2x3xf32> { %cst = arith.constant dense<25.4> : tensor<2x3xf32> @@ -644,6 +726,8 @@ func.func @QuantizePositiveSplat() -> tensor<2x3xf32> { // CHECK-NEXT: "tfl.quantize"(%[[cst]]) <{qtype = tensor<2x3x!quant.uniform>}> {volatile} } +// ----- + // CHECK-LABEL: QuantizePositiveScalar func.func @QuantizePositiveScalar() -> tensor { %cst = arith.constant dense<2.54> : tensor @@ -653,6 +737,8 @@ func.func @QuantizePositiveScalar() -> tensor { // CHECK-NEXT: "tfl.quantize"(%[[cst]]) <{qtype = tensor>}> {volatile} } +// ----- + // CHECK-LABEL: QuantizeNegativeSplat func.func @QuantizeNegativeSplat() -> tensor<2x3xf32> { %cst = arith.constant dense<-2.54> : tensor<2x3xf32> @@ -662,6 +748,8 @@ func.func @QuantizeNegativeSplat() -> tensor<2x3xf32> { // CHECK-NEXT: "tfl.quantize"(%[[cst]]) <{qtype = tensor<2x3x!quant.uniform>}> {volatile} } +// ----- + // CHECK-LABEL: QuantizeNegativeScalar func.func @QuantizeNegativeScalar() -> tensor { %cst = arith.constant dense<-25.4> : tensor @@ -671,6 +759,8 @@ func.func @QuantizeNegativeScalar() -> tensor { // CHECK-NEXT: "tfl.quantize"(%[[cst]]) <{qtype = tensor>}> {volatile} } +// ----- + // Make sure biases are not shared. // CHECK-LABEL: QuantizeSharedBiases func.func @QuantizeSharedBiases( @@ -700,6 +790,8 @@ func.func @QuantizeSharedBiases( // CHECK: %{{.*}} = "tfl.conv_2d"(%{{.*}}, %{{.*}}, %[[dq_0]]) } +// ----- + // Make sure biases are not shared. // CHECK-LABEL: QuantizeSharedBiases2 func.func @QuantizeSharedBiases2( @@ -727,6 +819,8 @@ func.func @QuantizeSharedBiases2( // CHECK: %{{.*}} = "tfl.conv_2d"(%{{.*}}, %{{.*}}, %[[dq]]) } +// ----- + // Make sure biases are not shared. // CHECK-LABEL: QuantizeSharedBiases3 func.func @QuantizeSharedBiases3( @@ -755,6 +849,8 @@ func.func @QuantizeSharedBiases3( // CHECK: %{{.*}} = tfl.add %{{.*}}, %[[dq_0]] } +// ----- + // Make sure constants are duplicataed for all users. // CHECK-LABEL: QuantizeSharedConstantsMultipleUsers func.func @QuantizeSharedConstantsMultipleUsers( @@ -785,6 +881,8 @@ func.func @QuantizeSharedConstantsMultipleUsers( // CHECK-DAG: "tfl.minimum"(%{{.*}}, %[[cst4]]) : (tensor<32xf32>, tensor<32xf32>) -> tensor<32xf32> } +// ----- + // Make sure quantization parameters are scanned from weight, but not from bias. 
// CHECK-LABEL: QuantizeWeight func.func @QuantizeWeight(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x32xf32> { @@ -803,6 +901,8 @@ func.func @QuantizeWeight(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x32 // CHECK: return %[[c]] : tensor<1x112x112x32xf32> } +// ----- + // Make sure quantization parameters are not scanned if quantize op is presented. // CHECK-LABEL: NoRedundantQuantizeWeight func.func @NoRedundantQuantizeWeight() -> tensor<1x112x112x32xf32> { @@ -817,6 +917,8 @@ func.func @NoRedundantQuantizeWeight() -> tensor<1x112x112x32xf32> { // CHECK-NEXT: return %[[dq]] : tensor<1x112x112x32xf32> } +// ----- + // CHECK-LABEL: ReturnQuantizedResult func.func @ReturnQuantizedResult(%arg0: tensor<1x224x224x3xf32>, %arg1: tensor<32x3x3x3xf32>, %arg2: tensor<32xf32>) -> (tensor<1x112x112x32xf32>, tensor<1x112x112x32xf32>) { %0 = "tfl.depthwise_conv_2d"(%arg0, %arg1, %arg2) {depth_multiplier = 4 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<1x224x224x3xf32>, tensor<32x3x3x3xf32>, tensor<32xf32>) -> tensor<1x112x112x32xf32> @@ -830,56 +932,7 @@ func.func @ReturnQuantizedResult(%arg0: tensor<1x224x224x3xf32>, %arg1: tensor<3 // CHECK: return %[[dq]], %[[dq]] } -// Series of values needing requantization -- first the args then the results -// of concatenation operations. concat(concat(arg2, arg0), concat(arg1, arg0)), -// concat(concat(arg2, arg0), arg3)). arg0 should be requantized twice -- -// concat(arg2, arg0) should be requantized twice as well. -// CHECK-LABEL: QuantizedCatsAddRequantsTest -func.func @QuantizedCatsAddRequantsTest(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1xf32>, %arg2: tensor<1x1xf32>, %arg3: tensor<1x1xf32>) -> (tensor<1x4xf32>, tensor<1x3xf32>) { - %0 = "quantfork.stats"(%arg0) {layerStats = dense<[-0.440728068, 0.189515018]> : tensor<2xf32>} : (tensor<1x1xf32>) -> tensor<1x1xf32> - %1 = "quantfork.stats"(%arg1) {layerStats = dense<[-0.154693216, 0.26483655]> : tensor<2xf32>} : (tensor<1x1xf32>) -> tensor<1x1xf32> - %2 = "quantfork.stats"(%arg2) {layerStats = dense<[-0.488159984, 0.16362021]> : tensor<2xf32>} : (tensor<1x1xf32>) -> tensor<1x1xf32> - %3 = "quantfork.stats"(%arg3) {layerStats = dense<[-0.25180456, 0.398609281]> : tensor<2xf32>} : (tensor<1x1xf32>) -> tensor<1x1xf32> - %6 = "tfl.concatenation"(%1, %0) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x2xf32> - %7 = "quantfork.stats"(%6) {layerStats = dense<[-0.440728068, 0.26483655]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> - %8 = "tfl.concatenation"(%2, %0) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x2xf32> - %9 = "quantfork.stats"(%8) {layerStats = dense<[-0.488159984, 0.189515018]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> - %10 = "tfl.concatenation"(%9, %7) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x4xf32> - %11 = "quantfork.stats"(%10) {layerStats = dense<[-0.488159984, 0.26483655]> : tensor<2xf32>} : (tensor<1x4xf32>) -> tensor<1x4xf32> - %13 = "tfl.concatenation"(%9, %3) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x1xf32>) -> tensor<1x3xf32> - %14 = "quantfork.stats"(%13) {layerStats = dense<[-0.488159984, 0.398609281]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> - func.return %10, %14 : tensor<1x4xf32>, 
tensor<1x3xf32> -// CHECK-NEXT: %[[q0:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform> -// CHECK-NEXT: %[[r0q0:.*]] = "tfl.quantize"(%[[q0]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform> -// CHECK-NEXT: %[[r1q0:.*]] = "tfl.quantize"(%[[q0]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform> -// CHECK-NEXT: %[[d1q0:.*]] = "tfl.dequantize"(%[[r1q0]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32> -// CHECK-NEXT: %[[d0q0:.*]] = "tfl.dequantize"(%[[r0q0]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32> -// CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform> -// CHECK-NEXT: %[[r0q1:.*]] = "tfl.quantize"(%[[q1]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform> -// CHECK-NEXT: %[[d0q1:.*]] = "tfl.dequantize"(%[[r0q1]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32> -// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%arg2) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform> -// CHECK-NEXT: %[[r0q2:.*]] = "tfl.quantize"(%[[q2]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform> -// CHECK-NEXT: %[[d0q2:.*]] = "tfl.dequantize"(%[[r0q2]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32> -// CHECK-NEXT: %[[q3:.*]] = "tfl.quantize"(%arg3) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform> -// CHECK-NEXT: %[[r0q3:.*]] = "tfl.quantize"(%[[q3]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform> -// CHECK-NEXT: %[[d0q3:.*]] = "tfl.dequantize"(%[[r0q3]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32> -// CHECK-NEXT: %[[cat1_0:.*]] = "tfl.concatenation"(%[[d0q1]], %[[d1q0]]) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x2xf32> -// CHECK-NEXT: %[[qcat1_0:.*]] = "tfl.quantize"(%[[cat1_0]]) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> -// CHECK-NEXT: %[[r0qcat1_0:.*]] = "tfl.quantize"(%[[qcat1_0]]) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> -// CHECK-NEXT: %[[d0qcat1_0:.*]] = "tfl.dequantize"(%[[r0qcat1_0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> -// CHECK-NEXT: %[[cat_2_0:.*]] = "tfl.concatenation"(%[[d0q2]], %[[d0q0]]) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x2xf32> -// CHECK-NEXT: %[[qcat_2_0:.*]] = "tfl.quantize"(%[[cat_2_0]]) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> -// CHECK-NEXT: %[[r0qcat_2_0:.*]] = "tfl.quantize"(%[[qcat_2_0]]) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> -// CHECK-NEXT: %[[d0qcat_2_0:.*]] = "tfl.dequantize"(%[[r0qcat_2_0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> -// CHECK-NEXT: %[[dqcat_2_0:.*]] = "tfl.dequantize"(%[[qcat_2_0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> -// CHECK-NEXT: %[[cat_2_0_1_0:.*]] = "tfl.concatenation"(%[[dqcat_2_0]], %[[d0qcat1_0]]) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x4xf32> -// CHECK-NEXT: 
%[[qcat_2_0_1_0:.*]] = "tfl.quantize"(%[[cat_2_0_1_0]]) <{qtype = tensor<1x4x!quant.uniform>}> {volatile} : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> -// CHECK-NEXT: %[[dqcat_2_0_1_0:.*]] = "tfl.dequantize"(%[[qcat_2_0_1_0]]) : (tensor<1x4x!quant.uniform>) -> tensor<1x4xf32> -// CHECK-NEXT: %[[cat_2_0_3:.*]] = "tfl.concatenation"(%[[d0qcat_2_0]], %[[d0q3]]) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, tensor<1x1xf32>) -> tensor<1x3xf32> -// CHECK-NEXT: %[[qcat_2_0_3:.*]] = "tfl.quantize"(%[[cat_2_0_3]]) <{qtype = tensor<1x3x!quant.uniform>}> {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> -// CHECK-NEXT: %[[dqcat_2_0_3:.*]] = "tfl.dequantize"(%[[qcat_2_0_3]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> -// CHECK-NEXT: return %[[dqcat_2_0_1_0]], %[[dqcat_2_0_3]] : tensor<1x4xf32>, tensor<1x3xf32> -} +// ----- // QDQ-LABEL: TransposePerTensorQuantizationPropagation func.func @TransposePerTensorQuantizationPropagation() -> tensor<2x5xf32> { @@ -900,6 +953,8 @@ func.func @TransposePerTensorQuantizationPropagation() -> tensor<2x5xf32> { // QDQ-NEXT: return %[[dqtw]] : tensor<2x5xf32> } +// ----- + // QDQ-LABEL: TransposePerChannelNewQuantDim func.func @TransposePerChannelNewQuantDim() -> tensor<2x5xf32> { %perm = arith.constant dense<[1, 0]> : tensor<2xi32> @@ -919,6 +974,8 @@ func.func @TransposePerChannelNewQuantDim() -> tensor<2x5xf32> { // QDQ-NEXT: return %[[dqtw]] : tensor<2x5xf32> } +// ----- + // QDQ-LABEL: ReshapePerChannelNewQuantDim func.func @ReshapePerChannelNewQuantDim() -> tensor<24x5xf32> { %cst = arith.constant dense<1.0> : tensor<1x2x3x4x5xf32> @@ -938,6 +995,8 @@ func.func @ReshapePerChannelNewQuantDim() -> tensor<24x5xf32> { // QDQ-NEXT: return %4 : tensor<24x5xf32> } +// ----- + // QDQ-LABEL: TransposePerChannelNewQuantDim_int4 func.func @TransposePerChannelNewQuantDim_int4() -> tensor<2x5xf32> { %perm = arith.constant dense<[1, 0]> : tensor<2xi32> @@ -956,3 +1015,27 @@ func.func @TransposePerChannelNewQuantDim_int4() -> tensor<2x5xf32> { // QDQ-NEXT: %[[dqtw:.*]] = "tfl.dequantize"(%[[qtw]]) : (tensor<2x5x!quant.uniform:f32:1 // QDQ-NEXT: return %[[dqtw]] : tensor<2x5xf32> } + +// ----- + +// CHECK-LABEL: concat_requantize_inputs_and_outputs_if_different_scales +func.func @concat_requantize_inputs_and_outputs_if_different_scales(%arg0: tensor<2x1xf32>, %arg1: tensor<2x3xf32>) -> (tensor<2x4xf32>) { + %0 = "tfl.quantize"(%arg0) {qtype = tensor<2x1x!quant.uniform>} : (tensor<2x1xf32>) -> tensor<2x1x!quant.uniform> + %1 = "tfl.dequantize"(%0) : (tensor<2x1x!quant.uniform>) -> (tensor<2x1xf32>) + %2 = "tfl.quantize"(%arg1) {qtype = tensor<2x3x!quant.uniform>} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform> + %3 = "tfl.dequantize"(%2) : (tensor<2x3x!quant.uniform>) -> (tensor<2x3xf32>) + %4 = "tfl.concatenation"(%1, %3) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<2x1xf32>, tensor<2x3xf32>) -> tensor<2x4xf32> + func.return %4: tensor<2x4xf32> + +// CHECK: %0 = "tfl.quantize"(%arg0) <{qtype = tensor<2x1x!quant.uniform>}> : (tensor<2x1xf32>) -> tensor<2x1x!quant.uniform> +// CHECK-NEXT: %1 = "tfl.dequantize"(%0) +// CHECK-NEXT: %2 = "tfl.quantize"(%arg1) <{qtype = tensor<2x3x!quant.uniform>}> : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform> +// CHECK-NEXT: %3 = "tfl.dequantize"(%2) +// CHECK-NEXT: %4 = "tfl.concatenation"(%1, %3) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<2x1xf32>, tensor<2x3xf32>) -> tensor<2x4xf32> +// CHECK-NEXT: %5 = "tfl.quantize"(%4) <{qtype = 
tensor<2x4x!quant.uniform>}> {volatile} : (tensor<2x4xf32>) -> tensor<2x4x!quant.uniform> +// CHECK-NEXT: %6 = "tfl.dequantize"(%5) +// CHECK-NEXT: return %6 +} + +// ----- + diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant.mlir index 1034782d68d9..22414eb03b48 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant.mlir @@ -1,7 +1,5 @@ -// RUN: tf-opt %s -tfl-raise-custom-ops="test-raise-tf-targets=tf.FakeQuantWithMinMaxVarsPerChannel,tf.FakeQuantWithMinMaxVars" -tfl-prepare-tf | FileCheck --dump-input=always %s -// RUN: tf-opt %s -tfl-raise-custom-ops="test-raise-tf-targets=tf.FakeQuantWithMinMaxVarsPerChannel,tf.FakeQuantWithMinMaxVars" -tfl-prepare-tf=use-fake-quant-num-bits=true | FileCheck --check-prefix LOBIT --dump-input=always %s - -module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { +// RUN: tf-opt %s -split-input-file -tfl-raise-custom-ops="test-raise-tf-targets=tf.FakeQuantWithMinMaxVarsPerChannel,tf.FakeQuantWithMinMaxVars" -tfl-prepare-tf | FileCheck --dump-input=always %s +// RUN: tf-opt %s -split-input-file -tfl-raise-custom-ops="test-raise-tf-targets=tf.FakeQuantWithMinMaxVarsPerChannel,tf.FakeQuantWithMinMaxVars" -tfl-prepare-tf=use-fake-quant-num-bits=true | FileCheck --check-prefix LOBIT --dump-input=always %s // CHECK-LABEL: fakeQuantPerChannelForActivation func.func @fakeQuantPerChannelForActivation(%arg0: tensor<8x4xf32>) -> (tensor<8x4xf32>) { @@ -16,6 +14,8 @@ func.func @fakeQuantPerChannelForActivation(%arg0: tensor<8x4xf32>) -> (tensor<8 // CHECK: return %[[dq]] } +// ----- + // CHECK-LABEL: fakeQuantForActivation func.func @fakeQuantForActivation(tensor<8xf32>) -> (tensor<8xf32>) { ^bb0(%arg0: tensor<8xf32>): @@ -30,6 +30,8 @@ func.func @fakeQuantForActivation(tensor<8xf32>) -> (tensor<8xf32>) { // CHECK: return %2 } +// ----- + // CHECK-LABEL: fakeQuantForActivationNoDuplication func.func @fakeQuantForActivationNoDuplication(tensor<8xf32>) -> (tensor<8x!quant.uniform>) { ^bb0(%arg0: tensor<8xf32>): @@ -44,6 +46,8 @@ func.func @fakeQuantForActivationNoDuplication(tensor<8xf32>) -> (tensor<8x!quan // CHECK: return %1 } +// ----- + // CHECK-LABEL: WrappedFakeQuantFolded func.func @WrappedFakeQuantFolded() -> tensor<8xf32> { %in = arith.constant dense<0.0> : tensor<8xf32> @@ -64,6 +68,8 @@ func.func @WrappedFakeQuantFolded() -> tensor<8xf32> { // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> } +// ----- + // CHECK-LABEL: fakeQuantFolded func.func @fakeQuantFolded() -> (tensor<8xf32>) { %in = arith.constant dense<0.0> : tensor<8xf32> @@ -80,6 +86,8 @@ func.func @fakeQuantFolded() -> (tensor<8xf32>) { // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> } +// ----- + // CHECK-LABEL: fakeQuantFoldedWithoutIdentity func.func @fakeQuantFoldedWithoutIdentity() -> (tensor<8xf32>) { %in = arith.constant dense<0.0> : tensor<8xf32> @@ -94,6 +102,8 @@ func.func @fakeQuantFoldedWithoutIdentity() -> (tensor<8xf32>) { // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> } +// ----- + // CHECK-LABEL: fakeQuantFoldedWithCast func.func @fakeQuantFoldedWithCast() -> (tensor<8xf32>) { %in = arith.constant dense<0.0> : tensor<8xf32> @@ -112,6 +122,8 @@ func.func @fakeQuantFoldedWithCast() -> (tensor<8xf32>) { // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> } +// ----- + // CHECK-LABEL: fakeQuantNotFolded func.func @fakeQuantNotFolded(tensor<8xf32>, tensor, tensor) -> 
(tensor<8xf32>) { ^bb0(%arg0: tensor<8xf32>, %arg3: tensor, %arg4: tensor): @@ -122,6 +134,8 @@ func.func @fakeQuantNotFolded(tensor<8xf32>, tensor, tensor) -> (tenso // CHECK: return %0 : tensor<8xf32> } +// ----- + // CHECK-LABEL: fakeQuantFollowedByTranspose func.func @fakeQuantFollowedByTranspose(tensor<1x2xf32>, tensor, tensor) -> (tensor<2x1xf32>) { ^bb0(%arg0: tensor<1x2xf32>, %arg1: tensor, %arg2: tensor): @@ -136,6 +150,8 @@ func.func @fakeQuantFollowedByTranspose(tensor<1x2xf32>, tensor, tensor, tensor, tensor) -> (tensor<2x1xf32>, tensor<2x1xf32>) { ^bb0(%arg0: tensor<1x2xf32>, %arg1: tensor, %arg2: tensor): @@ -151,6 +167,8 @@ func.func @fakeQuantFollowedByTransposes(tensor<1x2xf32>, tensor, tensor, tensor, tensor) -> (tensor<2x1xf32>) { ^bb0(%arg0: tensor<1x2xf32>, %arg1: tensor, %arg2: tensor): @@ -166,6 +184,8 @@ func.func @fakeQuantFollowedByReshape(tensor<1x2xf32>, tensor, tensor) // CHECK: return %1 } +// ----- + // CHECK-LABEL: fakeQuantFollowedByReshapes func.func @fakeQuantFollowedByReshapes(tensor<1x2xf32>, tensor, tensor) -> (tensor<2x1xf32>, tensor<2x1xf32>) { ^bb0(%arg0: tensor<1x2xf32>, %arg1: tensor, %arg2: tensor): @@ -183,6 +203,8 @@ func.func @fakeQuantFollowedByReshapes(tensor<1x2xf32>, tensor, tensor // CHECK-SAME: tensor<2x1xf32> } +// ----- + // CHECK-LABEL: fakeQuantWithConv2D func.func @fakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x8x7x16xf32>) { ^bb0(%arg: tensor<256x32x32x3xf32>) : @@ -203,6 +225,8 @@ func.func @fakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x8x7x16xf3 // CHECK: return %[[CONV]] } +// ----- + // CHECK-LABEL: perChannelFakeQuantWithConv2D func.func @perChannelFakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x8x7x16xf32>) { ^bb0(%arg: tensor<256x32x32x3xf32>) : @@ -224,6 +248,8 @@ func.func @perChannelFakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256 // CHECK: return %[[CONV]] : tensor<256x8x7x16xf32> } +// ----- + // CHECK-LABEL: fakeQuantWithDepthwiseConv2D func.func @fakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x30x30x16xf32>) { ^bb0(%arg: tensor<256x32x32x3xf32>) : @@ -244,6 +270,8 @@ func.func @fakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x // CHECK: return %[[CONV]] } +// ----- + // CHECK-LABEL: perChannelFakeQuantWithDepthwiseConv2D func.func @perChannelFakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x30x30x16xf32>) { ^bb0(%arg: tensor<256x32x32x3xf32>) : @@ -267,6 +295,8 @@ func.func @perChannelFakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (t // CHECK: return %[[CONV]] } +// ----- + // CHECK-LABEL: perChannelFakeQuantWithDepthwiseConv2DWithReshape func.func @perChannelFakeQuantWithDepthwiseConv2DWithReshape(%arg: tensor<1x160x160x48xf32>) -> (tensor<1x160x160x48xf32>) { %in = arith.constant dense<0.0> : tensor<3x3x48x1xf32> @@ -293,6 +323,8 @@ func.func @perChannelFakeQuantWithDepthwiseConv2DWithReshape(%arg: tensor<1x160x // CHECK: return %[[CONV]] } +// ----- + // LOBIT-LABEL: fakeQuant3BitPerChannelForActivation func.func @fakeQuant3BitPerChannelForActivation(%arg0: tensor<8x4xf32>) -> (tensor<8x4xf32>) { %arg1 = arith.constant dense<[0.0, -1.0, -31.0, -30.0]> : tensor<4xf32> @@ -306,6 +338,8 @@ func.func @fakeQuant3BitPerChannelForActivation(%arg0: tensor<8x4xf32>) -> (tens // LOBIT: return %[[dq]] } +// ----- + // LOBIT-LABEL: fakeQuant3BitForActivation func.func @fakeQuant3BitForActivation(tensor<8xf32>) -> (tensor<8xf32>) { ^bb0(%arg0: tensor<8xf32>): @@ -320,6 +354,8 @@ func.func 
@fakeQuant3BitForActivation(tensor<8xf32>) -> (tensor<8xf32>) { // LOBIT: return %2 } +// ----- + // CHECK-LABEL: fakeQuantConcat func.func @fakeQuantConcat(%arg0: tensor<1x6400x2xf32>, %arg1: tensor<1x1600x2xf32>) -> (tensor<1x8000x2xf32>) { %cst = arith.constant dense<1> : tensor @@ -345,6 +381,38 @@ func.func @fakeQuantConcat(%arg0: tensor<1x6400x2xf32>, %arg1: tensor<1x1600x2xf // CHECK: return %9 } +// ----- + +// CHECK-LABEL: fakeQuantConcatQDQ +func.func @fakeQuantConcatQDQ(%arg0: tensor<1x6400x2xf32>, %arg1: tensor<1x1600x2xf32>) -> (tensor<1x8000x2xf32>) { + %cst = arith.constant dense<1> : tensor + %cst_1 = arith.constant dense<-1.0> : tensor + %cst_2 = arith.constant dense<1.0> : tensor + %cst_3 = arith.constant dense<-2.0> : tensor + %cst_4 = arith.constant dense<0.5> : tensor + %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst_1, %cst_2) {num_bits = 8, narrow_range = false} : (tensor<1x6400x2xf32>, tensor, tensor) -> tensor<1x6400x2xf32> + %1 = "tfl.quantize"(%0) {qtype = tensor<1x6400x2x!quant.uniform>} : (tensor<1x6400x2xf32>) -> tensor<1x6400x2x!quant.uniform> + %2 = "tfl.dequantize"(%1) : (tensor<1x6400x2x!quant.uniform>) -> tensor<1x6400x2xf32> + %3 = "tf.FakeQuantWithMinMaxVars"(%arg1, %cst_3, %cst_4) {num_bits = 8, narrow_range = false} : (tensor<1x1600x2xf32>, tensor, tensor) -> tensor<1x1600x2xf32> + %4 = "tfl.quantize"(%3) {qtype = tensor<1x1600x2x!quant.uniform>} : (tensor<1x1600x2xf32>) -> tensor<1x1600x2x!quant.uniform> + %5 = "tfl.dequantize"(%4) : (tensor<1x1600x2x!quant.uniform>) -> tensor<1x1600x2xf32> + %6 = "tf.ConcatV2"(%2, %5, %cst) : (tensor<1x6400x2xf32>, tensor<1x1600x2xf32>, tensor) -> tensor<1x8000x2xf32> + return %6 : tensor<1x8000x2xf32> + +// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst_0, %cst_1) +// CHECK: %1 = "tfl.quantize"(%0) +// CHECK: %2 = "tfl.dequantize"(%1) +// CHECK: %3 = "tf.FakeQuantWithMinMaxVars"(%arg1, %cst_2, %cst_3) +// CHECK: %4 = "tfl.quantize"(%3) +// CHECK: %5 = "tfl.dequantize"(%4) +// CHECK: %6 = "tf.ConcatV2"(%2, %5, %cst) +// CHECK: %7 = "tf.FakeQuantWithMinMaxVars"(%6, %cst_2, %cst_1) <{narrow_range = false, num_bits = 8 : i64}> : (tensor<1x8000x2xf32>, tensor, tensor) -> tensor<1x8000x2xf32> +// CHECK: %8 = "tfl.quantize"(%7) <{qtype = tensor<1x8000x2x!quant.uniform>}> : (tensor<1x8000x2xf32>) -> tensor<1x8000x2x!quant.uniform> +// CHECK: %9 = "tfl.dequantize"(%8) : (tensor<1x8000x2x!quant.uniform>) -> tensor<1x8000x2xf32> +// CHECK: return %9 +} + +// ----- // CHECK-LABEL: populateFakeQuantOnMeanOutput func.func @populateFakeQuantOnMeanOutput(%arg0: tensor) -> (tensor) { @@ -365,6 +433,67 @@ func.func @populateFakeQuantOnMeanOutput(%arg0: tensor) -> (tensor) { // CHECK: return %6 } +// ----- + +// CHECK-LABEL: populateFakeQuantOnMeanOutputQDQs +func.func @populateFakeQuantOnMeanOutputQDQs(%arg0: tensor) -> (tensor) { + %cst = arith.constant dense<-1.0> : tensor + %cst_1 = arith.constant dense<1.0> : tensor + %cst_2 = arith.constant dense<0> : tensor<1xi32> + %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_1) {num_bits = 8, narrow_range = false} : (tensor, tensor, tensor) -> tensor + %1 = "tfl.quantize"(%0) <{qtype = tensor>}> : (tensor) -> tensor> + %2 = "tfl.dequantize"(%1) : (tensor>) -> tensor + %3 = "tf.Mean"(%2, %cst_2) <{keep_dims = false}> : (tensor, tensor<1xi32>) -> tensor + return %3 : tensor + +// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) +// CHECK-NEXT: %1 = "tfl.quantize"(%0) <{qtype = tensor>}> : (tensor) -> tensor> +// CHECK-NEXT: %2 = "tfl.dequantize"(%1) : (tensor>) -> 
tensor +// CHECK-NEXT: %3 = "tf.Mean"(%2, %cst_1) +// CHECK-NEXT: %4 = "tf.FakeQuantWithMinMaxVars"(%3, %cst, %cst_0) +// CHECK-NEXT: %5 = "tfl.quantize"(%4) <{qtype = tensor>}> : (tensor) -> tensor> +// CHECK-NEXT: %6 = "tfl.dequantize"(%5) : (tensor>) -> tensor +// CHECK-NEXT: return %6 +} + +// ----- + +// CHECK-LABEL: populateFakeQuantOnMeanOutputFollowedByConcat +func.func @populateFakeQuantOnMeanOutputFollowedByConcat(%arg0: tensor, %arg1: tensor) -> (tensor<1xf32>) { + %cst = arith.constant dense<1> : tensor + %cst_1 = arith.constant dense<-1.0> : tensor + %cst_2 = arith.constant dense<1.0> : tensor + %cst_3 = arith.constant dense<0> : tensor<1xi32> + %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst_1, %cst_2) {num_bits = 8, narrow_range = false} : (tensor, tensor, tensor) -> tensor + %1 = "tf.Mean"(%0, %cst_3) <{keep_dims = false}> : (tensor, tensor<1xi32>) -> tensor + %2 = "tf.FakeQuantWithMinMaxVars"(%arg1, %cst_1, %cst_2) {num_bits = 8, narrow_range = false} : (tensor, tensor, tensor) -> tensor + %3 = "tf.Mean"(%2, %cst_3) <{keep_dims = false}> : (tensor, tensor<1xi32>) -> tensor + %4 = "tf.ConcatV2"(%1, %3, %cst) : (tensor, tensor, tensor) -> tensor<1xf32> + return %4 : tensor<1xf32> + +// CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst_0, %cst_1) +// CHECK-NEXT: %1 = "tfl.quantize"(%0) <{qtype = tensor>}> : (tensor) -> tensor> +// CHECK-NEXT: %2 = "tfl.dequantize"(%1) : (tensor>) -> tensor +// CHECK-NEXT: %3 = "tf.Mean"(%2, %cst_2) +// CHECK-NEXT: %4 = "tf.FakeQuantWithMinMaxVars"(%3, %cst_0, %cst_1) +// CHECK-NEXT: %5 = "tfl.quantize"(%4) <{qtype = tensor>}> : (tensor) -> tensor> +// CHECK-NEXT: %6 = "tfl.dequantize"(%5) : (tensor>) -> tensor +// CHECK-NEXT: %7 = "tf.FakeQuantWithMinMaxVars"(%arg1, %cst_0, %cst_1) +// CHECK-NEXT: %8 = "tfl.quantize"(%7) <{qtype = tensor>}> : (tensor) -> tensor> +// CHECK-NEXT: %9 = "tfl.dequantize"(%8) : (tensor>) -> tensor +// CHECK-NEXT: %10 = "tf.Mean"(%9, %cst_2) +// CHECK-NEXT: %11 = "tf.FakeQuantWithMinMaxVars"(%10, %cst_0, %cst_1) +// CHECK-NEXT: %12 = "tfl.quantize"(%11) <{qtype = tensor>}> : (tensor) -> tensor> +// CHECK-NEXT: %13 = "tfl.dequantize"(%12) : (tensor>) -> tensor +// CHECK-NEXT: %14 = "tf.ConcatV2"(%6, %13, %cst) +// CHECK-NEXT: %15 = "tf.FakeQuantWithMinMaxVars"(%14, %cst_0, %cst_1) <{narrow_range = false, num_bits = 8 : i64}> : (tensor<1xf32>, tensor, tensor) -> tensor<1xf32> +// CHECK-NEXT: %16 = "tfl.quantize"(%15) <{qtype = tensor<1x!quant.uniform>}> : (tensor<1xf32>) -> tensor<1x!quant.uniform> +// CHECK-NEXT: %17 = "tfl.dequantize"(%16) : (tensor<1x!quant.uniform>) -> tensor<1xf32> +// CHECK-NEXT: return %17 +} + +// ----- + // CHECK-LABEL: populateFakeQuantOnMeanOutputNegativeCase func.func @populateFakeQuantOnMeanOutputNegativeCase(%arg0: tensor) -> (tensor) { %cst = arith.constant dense<-1.0> : tensor @@ -383,5 +512,5 @@ func.func @populateFakeQuantOnMeanOutputNegativeCase(%arg0: tensor) -> (ten // CHECK-NOT: "tf.FakeQuantWithMinMaxVars" } -} +// ----- diff --git a/tensorflow/compiler/mlir/lite/tests/quantize-strict.mlir b/tensorflow/compiler/mlir/lite/tests/quantize-strict.mlir index 4240ea659884..4fca520c3cc5 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize-strict.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize-strict.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -tfl-quantize='qdq-conversion-mode=Strict' | FileCheck %s +// RUN: tf-opt %s -tfl-quantize='qdq-conversion-mode=Strict' | FileCheck %s // CHECK-LABEL: QuantizeConvDRQ func.func private @XlaCallModule_quant.fake_quant.impl_0(%arg0: 
tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> func.func @QuantizeConvDRQ(%arg0: tensor<1x4x4x3xf32>) -> (tensor<1x4x4x1xf32>) { @@ -54,6 +54,7 @@ func.func @QuantizeConvWithBiasAndReluDRQ(%arg0: tensor<1x4x4x3xf32>) -> (tensor // ----- +// CHECK-LABEL: QuantizeConvWithBiasAndReluWeightOnly func.func @QuantizeConvWithBiasAndReluWeightOnly(%arg0: tensor<1x4x4x3xf32>) -> (tensor<1x4x4x1xf32>) { %cst = arith.constant dense<1.14751196> : tensor<1xf32> %cst_0 = arith.constant dense<[[[[1.76285899, -0.257785767, 0.20429258], [1.16310906, 0.23124367, 0.529797196]], [[0.348971426, -0.319283515, -0.772461354], [0.316666812, 1.88180697, -1.78054631]]]]> : tensor<1x2x2x3xf32> @@ -71,9 +72,10 @@ func.func @QuantizeConvWithBiasAndReluWeightOnly(%arg0: tensor<1x4x4x3xf32>) -> // ----- +// CHECK-LABEL: QuantizeConvWithBiasAndReluSRQ func.func @QuantizeConvWithBiasAndReluSRQ(%arg0: tensor<1x4x4x3xf32>) -> (tensor<1x4x4x1xf32>) { %cst = arith.constant dense<1.14751196> : tensor<1xf32> - %0 = "tfl.quantize"(%cst) <{qtype = tensor<1x!quant.uniform>}> {volatile} : (tensor<1xf32>) -> tensor<1x!quant.uniform> + %0 = "tfl.quantize"(%cst) <{qtype = tensor<1x!quant.uniform>}> : (tensor<1xf32>) -> tensor<1x!quant.uniform> %1 = "tfl.dequantize"(%0) : (tensor<1x!quant.uniform>) -> tensor<1xf32> %cst_0 = arith.constant dense<[[[[1.76285899, -0.257785767, 0.20429258], [1.16310906, 0.23124367, 0.529797196]], [[0.348971426, -0.319283515, -0.772461354], [0.316666812, 1.88180697, -1.78054631]]]]> : tensor<1x2x2x3xf32> %2 = "tfl.quantize"(%arg0) <{qtype = tensor<1x4x4x3x!quant.uniform>}> : (tensor<1x4x4x3xf32>) -> tensor<1x4x4x3x!quant.uniform> @@ -95,6 +97,21 @@ func.func @QuantizeConvWithBiasAndReluSRQ(%arg0: tensor<1x4x4x3xf32>) -> (tensor // ----- +// CHECK-LABEL: QuantizeEmbeddingLookupDrq +func.func @QuantizeEmbeddingLookupDrq(%arg0: tensor<2xi32>) -> (tensor<2x4xf32>){ + %cst = arith.constant dense<[[1.0545162, -0.969288647, -0.594602108, -0.0318857245], [2.41093326, -1.87844908, -0.784769594, -0.313708425], [0.333708912, 1.76770353, -1.02776456, 1.41117179], [-0.508497119, -0.526377499, 0.503150403, 1.05497932], [-0.0874073281, 0.795816719, 2.65656161, -0.58229059]]> : tensor<5x4xf32> + %0 = "tfl.quantize"(%cst) <{qtype = tensor<5x4x!quant.uniform>}> : (tensor<5x4xf32>) -> tensor<5x4x!quant.uniform> + %1 = "tfl.dequantize"(%0) : (tensor<5x4x!quant.uniform>) -> tensor<5x4xf32> + %2 = "tfl.embedding_lookup"(%arg0, %1) : (tensor<2xi32>, tensor<5x4xf32>) -> tensor<2x4xf32> + return %2 : tensor<2x4xf32> + +// CHECK{LITERAL}: %0 = "tfl.pseudo_qconst"() <{qtype = tensor<5x4x!quant.uniform>, value = dense<[[127, -118, -72, -4], [127, -100, -42, -17], [24, 127, -74, 102], [-62, -64, 61, 127], [-4, 38, 127, -28]]> : tensor<5x4xi8>}> : () -> tensor<5x4x!quant.uniform> +// CHECK: %1 = "tfl.embedding_lookup"(%arg0, %0) : (tensor<2xi32>, tensor<5x4x!quant.uniform>) -> tensor<2x4xf32> +// CHECK: return %1 : tensor<2x4xf32> +} + +// ----- + // CHECK-LABEL: DQQToRequantize func.func @DQQToRequantize(%arg0: tensor<1x128x128x320x!quant.uniform>) -> (tensor<1x128x128x320x!quant.uniform>) { %0 = "tfl.dequantize"(%arg0) : (tensor<1x128x128x320x!quant.uniform>) -> tensor<1x128x128x320xf32> @@ -105,3 +122,14 @@ func.func @DQQToRequantize(%arg0: tensor<1x128x128x320x!quant.uniform> } +// ----- + +func.func @VolatileQuantizeConst() -> (tensor<1xf32>) { + %cst = arith.constant dense<1.14751196> : tensor<1xf32> + %0 = "tfl.quantize"(%cst) <{qtype = tensor<1x!quant.uniform>}> {volatile} : (tensor<1xf32>) -> tensor<1x!quant.uniform> + %1 = 
"tfl.dequantize"(%0) : (tensor<1x!quant.uniform>) -> tensor<1xf32> + return %1 : tensor<1xf32> +// CHECK: %0 = "tfl.pseudo_qconst"() <{qtype = tensor<1x!quant.uniform>, value = dense<20578> : tensor<1xi32>}> {volatile} : () -> tensor<1x!quant.uniform> +// CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x!quant.uniform>) -> tensor<1xf32> +// CHECK: return %1 : tensor<1xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/quantize-variables.mlir b/tensorflow/compiler/mlir/lite/tests/quantize-variables.mlir index a5ac48521818..4538c0cdd7b5 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize-variables.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize-variables.mlir @@ -152,7 +152,7 @@ func.func @QuantizeTwoVariable(%arg0: tensor<1x2x3xf32>) -> (tensor<1x2x3xf32>) %4 = "tfl.var_handle"() {container = "", shared_name = "read_assign/states0"} : () -> tensor %5 = "tfl.var_handle"() {container = "", shared_name = "read_assign/states1"} : () -> tensor - + %40 = "tfl.read_variable"(%4) : (tensor) -> tensor<1x2x3xf32> %41 = "quantfork.stats"(%40) {layerStats = dense<[0.0, 1.0]> : tensor<2xf32>} : (tensor<1x2x3xf32>) -> tensor<1x2x3xf32> %42 = "tfl.concatenation"(%41, %0) {axis = 1 : i32, fused_activation_function = "NONE"} : (tensor<1x2x3xf32>, tensor<1x2x3xf32>) -> tensor<1x4x3xf32> @@ -171,17 +171,16 @@ func.func @QuantizeTwoVariable(%arg0: tensor<1x2x3xf32>) -> (tensor<1x2x3xf32>) func.return %0 : tensor<1x2x3xf32> -// WHOLE-PASSES: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x3x!quant.uniform>}> : (tensor<1x2x3x!quant.uniform>) -> tensor<1x2x3x!quant.uniform> -// WHOLE-PASSES-DAG: %[[vh1:.*]] = "tfl.var_handle"() <{container = "", shared_name = "read_assign/states0"}> : () -> tensor<*x!tf_type.resource>>> -// WHOLE-PASSES-DAG: %[[vh2:.*]] = "tfl.var_handle"() <{container = "", shared_name = "read_assign/states1"}> : () -> tensor<*x!tf_type.resource>>> +// WHOLE-PASSES: %[[vh1:.*]] = "tfl.var_handle"() <{container = "", shared_name = "read_assign/states0"}> : () -> tensor<*x!tf_type.resource>>> +// WHOLE-PASSES-DAG: %[[vh2:.*]] = "tfl.var_handle"() <{container = "", shared_name = "read_assign/states1"}> : () -> tensor<*x!tf_type.resource>>> -// WHOLE-PASSES-DAG: %[[rv1:.*]] = "tfl.read_variable"({{.*}}) : (tensor<*x!tf_type.resource>>>) -> tensor<1x2x3x!quant.uniform> +// WHOLE-PASSES-DAG: %[[rv1:.*]] = "tfl.read_variable"({{.*}}) : (tensor<*x!tf_type.resource>>>) -> tensor<1x2x3x!quant.uniform> // WHOLE-PASSES-NEXT: %[[cc1:.*]] = "tfl.concatenation"(%[[rv1]], {{.*}}) {{.*}} : (tensor<1x2x3x!quant.uniform>, tensor<1x2x3x!quant.uniform>) -> tensor<1x4x3x!quant.uniform> // WHOLE-PASSES-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cc1]]) <{qtype = tensor<1x4x3x!quant.uniform>}> : (tensor<1x4x3x!quant.uniform>) -> tensor<1x4x3x!quant.uniform> // WHOLE-PASSES-NEXT: %[[ss1:.*]] = "tfl.strided_slice"(%[[q2]], {{.*}}) <{{{.*}}}> : (tensor<1x4x3x!quant.uniform>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x2x3x!quant.uniform> // WHOLE-PASSES-NEXT: "tfl.assign_variable"(%[[vh1]], %[[ss1]]) : (tensor<*x!tf_type.resource>>>, tensor<1x2x3x!quant.uniform>) -> () -// WHOLE-PASSES-DAG: %[[rv2:.*]] = "tfl.read_variable"({{.*}}) : (tensor<*x!tf_type.resource>>>) -> tensor<1x2x3x!quant.uniform> +// WHOLE-PASSES-DAG: %[[rv2:.*]] = "tfl.read_variable"({{.*}}) : (tensor<*x!tf_type.resource>>>) -> tensor<1x2x3x!quant.uniform> // WHOLE-PASSES-NEXT: %[[cc2:.*]] = "tfl.concatenation"(%[[rv2]], {{.*}}) {{.*}} : (tensor<1x2x3x!quant.uniform>, tensor<1x2x3x!quant.uniform>) -> 
tensor<1x4x3x!quant.uniform> // WHOLE-PASSES-NEXT: %[[ss2:.*]] = "tfl.strided_slice"(%[[cc2]], {{.*}}) <{{{.*}}}> : (tensor<1x4x3x!quant.uniform>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x2x3x!quant.uniform> // WHOLE-PASSES-NEXT: "tfl.assign_variable"(%[[vh2]], %[[ss2]]) : (tensor<*x!tf_type.resource>>>, tensor<1x2x3x!quant.uniform>) -> () diff --git a/tensorflow/compiler/mlir/lite/tests/quantize.mlir b/tensorflow/compiler/mlir/lite/tests/quantize.mlir index e3b95f65eade..f53598441abb 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize.mlir @@ -316,17 +316,17 @@ func.func @QuantizeConcat(tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2x!quant // ----- // CHECK-LABEL: QuantizeConcatRequantize -func.func @QuantizeConcatRequantize(tensor<1x2x!quant.uniform>, tensor<1x2xf32>) -> tensor<2x2x!quant.uniform> { -^bb0(%arg0: tensor<1x2x!quant.uniform>, %arg1: tensor<1x2xf32>): - %1 = "tfl.dequantize"(%arg0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> +func.func @QuantizeConcatRequantize(tensor<1x2x!quant.uniform>, tensor<1x2xf32>) -> tensor<2x2x!quant.uniform> { +^bb0(%arg0: tensor<1x2x!quant.uniform>, %arg1: tensor<1x2xf32>): + %1 = "tfl.dequantize"(%arg0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> %2 = "tfl.concatenation"(%1, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> - %3 = "tfl.quantize"(%2) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> - func.return %3 : tensor<2x2x!quant.uniform> + %3 = "tfl.quantize"(%2) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> + func.return %3 : tensor<2x2x!quant.uniform> -// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} -// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x!quant.uniform>}> +// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} +// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x!quant.uniform>}> // CHECK: %[[cc:.*]] = "tfl.concatenation"(%[[q0]], %[[q1]]) <{axis = 0 : i32, fused_activation_function = "NONE"}> -// CHECK: return %[[cc]] : tensor<2x2x!quant.uniform> +// CHECK: return %[[cc]] : tensor<2x2x!quant.uniform> } // ----- diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index 25789ab44d17..2e420ed6ef5f 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" #include "tensorflow/compiler/mlir/lite/core/macros.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_passes.h" #include "tensorflow/compiler/mlir/lite/quantization/tensorflow/passes.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h" @@ -44,7 +45,6 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/transforms/variable_freezing_pipeline.h" #include "tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h" #include "xla/mlir_hlo/mhlo/transforms/passes.h" @@ -70,14 +70,20 @@ void AddOptimizationPasses(const tflite::ConverterFlags& converter_flags, pass_manager->addPass(mlir::TFL::CreatePushTransposeThroughEwisePass()); - pass_manager->addNestedPass( - mlir::TFL::Create()); + // Add BroadcastLike optimization pass. + { + mlir::TFL::OptimizeBroadcastLikePassOptions options; + options.unsafe_fuse_dynamic_shaped_broadcast = + pass_config.unsafe_fuse_dynamic_shaped_broadcast; + pass_manager->addNestedPass( + mlir::TFL::Create(options)); + } // Add TFLite optimize pass. mlir::TFL::OptimizePassOptions optimize_pass_options; optimize_pass_options.enable_strict_qdq_mode = (pass_config.quant_specs.qdq_conversion_mode == - mlir::quant::QDQConversionMode::kQDQStrict); + mlir::TFL::QDQConversionMode::kQDQStrict); std::unique_ptr optimize_pass = mlir::TFL::Create(optimize_pass_options); auto pass_ptr = @@ -122,7 +128,7 @@ void AddStrictQDQQuantizationPasses( void AddQuantizationPasses(const mlir::TFL::PassConfig& pass_config, mlir::OpPassManager& pass_manager) { - const mlir::quant::QuantizationSpecs& quant_specs = pass_config.quant_specs; + const mlir::TFL::QuantizationSpecs& quant_specs = pass_config.quant_specs; pass_manager.addNestedPass( mlir::TFL::CreatePrepareQuantizePass(quant_specs)); if (quant_specs.default_ranges.first.has_value() || @@ -191,7 +197,7 @@ void AddVariableFreezingFromGlobalTensorsPasses( void AddDynamicRangeQuantizationPasses(const mlir::TFL::PassConfig& pass_config, mlir::OpPassManager& pass_manager) { - const mlir::quant::QuantizationSpecs& quant_specs = pass_config.quant_specs; + const mlir::TFL::QuantizationSpecs& quant_specs = pass_config.quant_specs; pass_manager.addNestedPass( mlir::TFL::CreatePrepareDynamicRangeQuantizePass(quant_specs)); pass_manager.addNestedPass( @@ -355,8 +361,13 @@ void AddPostQuantizationStableHloToTfPasses( // broadcasting support. This needs to be run immediately after HLO->TFL // legalization, otherwise the newly generated TFL broadcast ops can fold // and materialize the weights. - pass_manager.addNestedPass( - mlir::TFL::Create()); + { + mlir::TFL::OptimizeBroadcastLikePassOptions options; + options.unsafe_fuse_dynamic_shaped_broadcast = + pass_config.unsafe_fuse_dynamic_shaped_broadcast; + pass_manager.addNestedPass( + mlir::TFL::Create(options)); + } } // folds tf.BroadcastTo ops with subsequent ops if they have built in // broadcasting support. This needs to be run immediately after HLO->TF @@ -585,7 +596,7 @@ void AddPostVariableFreezingTFToTFLConversionPasses( pass_manager->addPass(mlir::TFL::CreateLegalizeHashTablesPass()); if (pass_config.quant_specs.qdq_conversion_mode == - mlir::quant::QDQConversionMode::kQDQStrict) { + mlir::TFL::QDQConversionMode::kQDQStrict) { pass_manager->addPass(mlir::TFL::CreateLowerQuantAnnotationsPass()); // To remove the quant annotation decompositions. 
@@ -611,7 +622,7 @@ void AddPostVariableFreezingTFToTFLConversionPasses( pass_manager->addNestedPass(mlir::createCSEPass()); if (pass_config.quant_specs.qdq_conversion_mode == - mlir::quant::QDQConversionMode::kQDQStrict) { + mlir::TFL::QDQConversionMode::kQDQStrict) { AddStrictQDQQuantizationPasses(converter_flags, pass_config, *pass_manager); } else { @@ -621,7 +632,7 @@ void AddPostVariableFreezingTFToTFLConversionPasses( if (pass_config.quant_specs .RunPropagationAndRewriteQuantizationPasses() || pass_config.quant_specs.qdq_conversion_mode != - mlir::quant::QDQConversionMode::kQDQNone) { + mlir::TFL::QDQConversionMode::kQDQNone) { AddQuantizationPasses(pass_config, *pass_manager); // Remove unnecessary QDQs while handling QAT models. pass_manager->addNestedPass( @@ -637,8 +648,9 @@ void AddPostVariableFreezingTFToTFLConversionPasses( converter_flags.reduce_type_precision()) { pass_manager->addPass(mlir::TFL::CreateReduceTypePrecisionPass()); } + pass_manager->addPass(mlir::TFL::CreateCleanupOptimizationBarrierPass()); - // This pass should alway run before the end of the model conversion but + // This pass should always run before the end of the model conversion but // not after the CreateSplitMergedOperandsPass below. if (pass_config.canonicalizing_inf_as_min_max_float) pass_manager->addPass(mlir::TFL::CreateCanonicalizeBoundaryValuePass()); @@ -658,7 +670,7 @@ void AddPostVariableFreezingTFToTFLConversionPasses( pass_manager->addPass( mlir::TFL::CreateInsertCallOnceOpFromSessionInitializerPass()); } else { - // This pass should alway run before the end of the model conversion. + // This pass should always run before the end of the model conversion. if (pass_config.canonicalizing_inf_as_min_max_float) pass_manager->addPass(mlir::TFL::CreateCanonicalizeBoundaryValuePass()); } diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index 5b20a6e72f99..b306986654c8 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -46,11 +46,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export_flags.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h" #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" -#include "tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "xla/hlo/translate/hlo_to_mhlo/translate.h" @@ -58,9 +57,15 @@ limitations under the License. 
#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/errors.h" +using llvm::cl::opt; using mlir::MLIRContext; using mlir::ModuleOp; +// NOLINTNEXTLINE +opt upgrade_legacy("tf-upgrade-legacy", + llvm::cl::desc("Upgrade legacy TF graph behavior"), + llvm::cl::init(false)); + // NOLINTNEXTLINE static llvm::cl::opt weight_quantization( "weight_quantization", @@ -184,9 +189,9 @@ int main(int argc, char **argv) { if (!module.ok()) return kTrFailure; // Set the quantization specifications from the command line flags. - mlir::quant::QuantizationSpecs quant_specs; - if (mlir::quant::ParseInputNodeQuantSpecs( - input_arrays, min_values, max_values, inference_type, &quant_specs)) { + mlir::TFL::QuantizationSpecs quant_specs; + if (mlir::TFL::ParseInputNodeQuantSpecs(input_arrays, min_values, max_values, + inference_type, &quant_specs)) { llvm::errs() << "Failed to get input quant spec."; return kTrFailure; } diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.cc index 0f05c371868b..7769a0ada951 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h" +#include + #include "llvm/Support/CommandLine.h" using llvm::cl::opt; @@ -218,3 +220,73 @@ opt model_origin_framework( "model-origin-framework", llvm::cl::desc("The source model type: PYTORCH, JAX, TENSORFLOW, etc."), llvm::cl::init("UNSET")); + +// NOLINTNEXTLINE +opt input_arrays( + "tf-input-arrays", llvm::cl::desc("Input tensor names, separated by ','"), + llvm::cl::init("")); + +// NOLINTNEXTLINE +opt input_dtypes( + "tf-input-data-types", + llvm::cl::desc("(Optional) Input tensor data types, separated by ','. Use " + "'' if a single data type is skipped. The data type from " + "the import graph is used if it is skipped."), + llvm::cl::init("")); + +// NOLINTNEXTLINE +opt input_shapes( + "tf-input-shapes", + llvm::cl::desc( + "Input tensor shapes. Shapes for different tensors are separated by " + "':', and dimension sizes for the same tensor are separated by ','"), + llvm::cl::init("")); + +// NOLINTNEXTLINE +opt output_arrays( + "tf-output-arrays", llvm::cl::desc("Output tensor names, separated by ','"), + llvm::cl::init("")); + +// NOLINTNEXTLINE +opt control_output_arrays( + "tf-control-output-arrays", + llvm::cl::desc("Control output node names, separated by ','"), + llvm::cl::init("")); + +// NOLINTNEXTLINE +opt inference_type( + "tf-inference-type", + llvm::cl::desc( + "Sets the type of real-number arrays in the output file. Only allows " + "float and quantized types"), + llvm::cl::init("")); + +// NOLINTNEXTLINE +opt min_values( + "tf-input-min-values", + llvm::cl::desc( + "Sets the lower bound of the input data. Separated by ','; Each entry " + "in the list should match an entry in -tf-input-arrays. This is " + "used when -tf-inference-type is a quantized type."), + llvm::cl::Optional, llvm::cl::init("")); + +// NOLINTNEXTLINE +opt max_values( + "tf-input-max-values", + llvm::cl::desc( + "Sets the upper bound of the input data. Separated by ','; Each entry " + "in the list should match an entry in -tf-input-arrays. 
This is " + "used when -tf-inference-type is a quantized type."), + llvm::cl::Optional, llvm::cl::init("")); + +// NOLINTNEXTLINE +opt debug_info_file( + "tf-debug-info", + llvm::cl::desc("Path to the debug info file of the input graph def"), + llvm::cl::init("")); + +// NOLINTNEXTLINE +opt enable_shape_inference( + "tf-enable-shape-inference-on-import", + llvm::cl::desc("Enable shape inference on import (temporary)"), + llvm::cl::init(false)); diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h b/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h index c225291360c9..6095b69d471a 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h @@ -48,6 +48,17 @@ extern llvm::cl::opt enable_dynamic_update_slice; extern llvm::cl::opt preserve_assert_op; extern llvm::cl::opt legalize_custom_tensor_list_ops; extern llvm::cl::opt reduce_type_precision; +extern llvm::cl::opt input_arrays; +extern llvm::cl::opt input_dtypes; +extern llvm::cl::opt input_shapes; +extern llvm::cl::opt output_arrays; +extern llvm::cl::opt control_output_arrays; +extern llvm::cl::opt inference_type; +extern llvm::cl::opt min_values; +extern llvm::cl::opt max_values; +extern llvm::cl::opt debug_info_file; +extern llvm::cl::opt upgrade_legacy; +extern llvm::cl::opt enable_shape_inference; // Import saved model. extern llvm::cl::opt import_saved_model_object_graph; diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index ca8c6eec8a24..e950a5d91b98 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -65,6 +65,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/metrics/converter_error_data.pb.h" #include "tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/quantize_weights.h" #include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.h" @@ -76,7 +77,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/mlir_module_utils.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" @@ -270,7 +270,7 @@ absl::StatusOr> LoadFromGraphdefOrMlirSource( // on the translated_result using quant_specs and saving the final output in // result. 
absl::Status ApplyDynamicRangeQuantizationFromOldQuantizer( - const mlir::quant::QuantizationSpecs& quant_specs, + const mlir::TFL::QuantizationSpecs& quant_specs, std::string translated_result, std::string* result) { flatbuffers::FlatBufferBuilder q_builder(/*initial_size=*/10240); const uint8_t* buffer = @@ -538,7 +538,7 @@ absl::Status ConvertTFExecutorToTFLOrFlatbuffer( } // Write MLIR TFLite dialect into FlatBuffer - const mlir::quant::QuantizationSpecs& quant_specs = pass_config.quant_specs; + const mlir::TFL::QuantizationSpecs& quant_specs = pass_config.quant_specs; OpOrArgLocNameMapper op_or_arg_name_mapper; tflite::FlatbufferExportOptions options; std::string translated_result; diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h index 304473e20106..9188b54e3708 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h @@ -21,6 +21,7 @@ limitations under the License. #include #include +#include "absl/status/status.h" #include "absl/status/statusor.h" #include "absl/strings/string_view.h" #include "absl/types/span.h" @@ -33,7 +34,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/loader.h" #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/converter_flags.pb.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/core/platform/status.h" diff --git a/tensorflow/compiler/mlir/lite/tools/BUILD b/tensorflow/compiler/mlir/lite/tools/BUILD index 63590fc545fd..055877d0b322 100644 --- a/tensorflow/compiler/mlir/lite/tools/BUILD +++ b/tensorflow/compiler/mlir/lite/tools/BUILD @@ -22,47 +22,3 @@ cc_library( ) # LINT.ThenChange(//tensorflow/lite/tools:command_line_flags) - -cc_library( - name = "translate_cl_options", - srcs = [ - "tf_mlir_translate_cl.cc", - ], - hdrs = [ - "tf_mlir_translate_cl.h", - ], - deps = [ - "@llvm-project//llvm:Support", - ], - alwayslink = 1, -) - -cc_library( - name = "translate_registration", - srcs = [ - "tf_mlir_translate_registration.cc", - ], - deps = [ - "//tensorflow/compiler/mlir/lite/tools:translate_cl_options", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow/translate:mlir_roundtrip_flags", - "//tensorflow/compiler/mlir/tensorflow/translate/tools:file_tf_mlir_translate", - "//tensorflow/compiler/mlir/tf2xla/api/v2:tf_executor_to_graph", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/core:core_cpu_base", - "//tensorflow/core:framework", - "//tensorflow/core:protos_all_cc", - "@com_google_absl//absl/container:flat_hash_set", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:TranslateLib", - "@local_tsl//tsl/platform:protobuf", - "@local_xla//xla/client:client_library", - "@local_xla//xla/client:compile_only_client", - "@local_xla//xla/service/cpu:cpu_compiler", - "@local_xla//xla/service/cpu:cpu_transfer_manager", - "@local_xla//xla/stream_executor/host:host_platform", - "@local_xla//xla/stream_executor/host:host_platform_id", - ], - alwayslink = 1, -) diff --git a/tensorflow/compiler/mlir/lite/tools/command_line_flags.cc 
b/tensorflow/compiler/mlir/lite/tools/command_line_flags.cc index 19ed0d7215b0..dd0ff61419c4 100644 --- a/tensorflow/compiler/mlir/lite/tools/command_line_flags.cc +++ b/tensorflow/compiler/mlir/lite/tools/command_line_flags.cc @@ -13,14 +13,13 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/tools/command_line_flags.h" #include +#include #include #include -#include #include #include #include #include -#include #include #include "absl/log/log.h" diff --git a/tensorflow/compiler/mlir/lite/transforms/cleanup_optimization_barrier_pass.cc b/tensorflow/compiler/mlir/lite/transforms/cleanup_optimization_barrier_pass.cc new file mode 100644 index 000000000000..8cb785ac86d8 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/cleanup_optimization_barrier_pass.cc @@ -0,0 +1,55 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/transforms/cleanup_optimization_barrier_pass.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo + +namespace mlir { +namespace TFL { +namespace { + +#define DEBUG_TYPE "cleanup-optimization-barrier" + +// Replaces the shlo.optimization_barrier op with its input. +struct CleanupOptimizationBarrier + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(stablehlo::OptimizationBarrierOp op, + PatternRewriter& rewriter) const override { + rewriter.replaceOp(op, op.getOperands()); + return success(); + } +}; +} // end namespace + +void CleanupOptimizationBarrierPass::runOnOperation() { + auto* ctx = &getContext(); + + RewritePatternSet patterns(ctx); + patterns.add(ctx); + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { + return signalPassFailure(); + } +} + +} // end namespace TFL +} // end namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/cleanup_optimization_barrier_pass.h b/tensorflow/compiler/mlir/lite/transforms/cleanup_optimization_barrier_pass.h new file mode 100644 index 000000000000..3a6bd2a863e0 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/cleanup_optimization_barrier_pass.h @@ -0,0 +1,58 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_CLEANUP_OPTIMIZATION_BARRIER_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_CLEANUP_OPTIMIZATION_BARRIER_PASS_H_ + +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/transforms/pass.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" + +namespace mlir { +namespace TFL { + +// Pass to clean up shlo.optimization_barrier ops. + +class CleanupOptimizationBarrierPass + : public TFL::Pass { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CleanupOptimizationBarrierPass) + + CleanupOptimizationBarrierPass() = default; + CleanupOptimizationBarrierPass(const CleanupOptimizationBarrierPass&) {}; + + void runOnOperation() override; + static llvm::StringRef GetName() { return "CleanupOptimizationBarrierPass"; } + static llvm::StringRef GetArgument() { + return "tfl-cleanup-optimization-barrier"; + } + static llvm::StringRef GetDescription() { + return "Pass to clean up shlo.optimization_barrier ops."; + } + + private: + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } +}; +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_CLEANUP_OPTIMIZATION_BARRIER_PASS_H_ diff --git a/tensorflow/compiler/mlir/lite/transforms/converter_pass_options_setter.cc b/tensorflow/compiler/mlir/lite/transforms/converter_pass_options_setter.cc index f0fb9361980f..5a3f23fe6df3 100644 --- a/tensorflow/compiler/mlir/lite/transforms/converter_pass_options_setter.cc +++ b/tensorflow/compiler/mlir/lite/transforms/converter_pass_options_setter.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/transforms/converter_pass_options_setter.h" +#include "tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass_options.h" #include "tensorflow/compiler/mlir/lite/transforms/optimize_pass_options.h" #include "tensorflow/compiler/mlir/lite/transforms/pass_options.h" #include "tensorflow/compiler/mlir/lite/transforms/variable_freezing_pipeline_options.h" @@ -33,6 +34,12 @@ void ConverterPassOptionsSetter::SetOptions( options.enable_tflite_variables = pass_config_.enable_tflite_variables; } +void ConverterPassOptionsSetter::SetOptions( + OptimizeBroadcastLikePassOptions& options) const { + // options.unsafe_fuse_dynamic_shaped_broadcast = + // converter_flags_.unsafe_fuse_dynamic_shaped_broadcast(); +} + void ConverterPassOptionsSetter::SetOptions(EmptyPassOptions& options) const {} } // namespace TFL diff --git a/tensorflow/compiler/mlir/lite/transforms/converter_pass_options_setter.h b/tensorflow/compiler/mlir/lite/transforms/converter_pass_options_setter.h index 01f71afe84ca..59151448b92f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/converter_pass_options_setter.h +++ b/tensorflow/compiler/mlir/lite/transforms/converter_pass_options_setter.h @@ -26,6 +26,7 @@ namespace TFL { class OptimizePassOptions; class VariableFreezingPipelineOptions; class EmptyPassOptions; +class OptimizeBroadcastLikePassOptions; // PassOptionsSetter to set TFLite Converter Pass/Pipeline Options based on // ConverterFlags and TFL::PassConfig values. @@ -40,6 +41,7 @@ class ConverterPassOptionsSetter : public PassOptionsSetter { void SetOptions(OptimizePassOptions& options) const override; void SetOptions(VariableFreezingPipelineOptions& options) const override; void SetOptions(EmptyPassOptions& options) const override; + void SetOptions(OptimizeBroadcastLikePassOptions& options) const override; private: tflite::ConverterFlags converter_flags_; diff --git a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc index f1b602a6763a..a15f71fb7ebf 100644 --- a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc +++ b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc @@ -28,11 +28,11 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/lite/utils/utils.h" #include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" //===----------------------------------------------------------------------===// // The Pass to add default quantization parameters for the activations which @@ -41,8 +41,8 @@ limitations under the License. namespace mlir { namespace TFL { -// Includs an auto-generated function, which can retrieve the quantization -// specification for an TFL operation. The signature of the function is +// Includes an auto-generated function, which can retrieve the quantization +// specification for a TFL operation. 
The signature of the function is // std::unique_pointer TFL::GetOpQuantSpec(Operation *) #include "tensorflow/compiler/mlir/lite/utils/generated_op_quant_spec_getters.inc" @@ -54,7 +54,7 @@ namespace { class DefaultQuantParamsPass : public impl::DefaultQuantParamsPassBase { public: - using DefaultQuantParamsPassBase::DefaultQuantParamsPassBase; + DefaultQuantParamsPass() {} explicit DefaultQuantParamsPass(double default_min, double default_max, bool is_signed) { @@ -87,21 +87,20 @@ class DefaultQuantParamsPass // Uses `quant_params` to quantize `value` and inserting a pair of // tfl.quantize and tfl.dequantize ops for this `value`. - void QuantizeValue(OpBuilder builder, Value value, - quant::QuantParams quant_params); + void QuantizeValue(OpBuilder builder, Value value, QuantParams quant_params); // If the value hasn't been quantized, the functions adds it to `values`. void AddToWorkListIfUnquantized(Value value, std::vector *values); // Converts the default min/max to the default quantization parameters. - quant::QuantParams GetDefaultQuantParams(Builder builder); + QuantParams GetDefaultQuantParams(Builder builder); // Gets the quantization parameters for the bias of an operation by using the // quantization parameters from the non-biases operands. - quant::QuantParams GetQuantParamsForBias(Operation *op, int bias, - const std::vector &non_biases, - quant::AccumulatorScaleFunc func); - quant::QuantParams default_quant_params_; + QuantParams GetQuantParamsForBias(Operation *op, int bias, + const std::vector &non_biases, + AccumulatorScaleFunc func); + QuantParams default_quant_params_; }; } // namespace @@ -123,7 +122,7 @@ void DefaultQuantParamsPass::runOnOperation() { } func.walk([&](Operation *op) { - if (!quant::IsOpQuantizable(op) || op->getParentOfType()) { + if (!IsOpQuantizable(op) || op->getParentOfType()) { return; } @@ -137,7 +136,7 @@ void DefaultQuantParamsPass::runOnOperation() { }); // Apply the default quantization parameters for these activation values. - quant::QuantParams default_params = GetDefaultQuantParams(builder); + QuantParams default_params = GetDefaultQuantParams(builder); for (Value value : activation_values) { QuantizeValue(builder, value, default_params); } @@ -148,7 +147,7 @@ void DefaultQuantParamsPass::runOnOperation() { Operation *op = *bias.user_begin(); auto spec = TFL::GetOpQuantSpec(op); for (auto &it : spec->biases_params) { - quant::QuantParams bias_params = GetQuantParamsForBias( + QuantParams bias_params = GetQuantParamsForBias( op, it.first, it.second.first, it.second.second); if (!bias_params) continue; QuantizeValue(builder, bias, bias_params); @@ -177,7 +176,7 @@ void DefaultQuantParamsPass::AddToWorkListIfUnquantized( } void DefaultQuantParamsPass::QuantizeValue(OpBuilder builder, Value value, - quant::QuantParams quant_params) { + QuantParams quant_params) { Type expressed_type = value.getType(); Type new_type = quant_params.castFromExpressedType(expressed_type); // This value isn't an expressed type (float), skip. 
@@ -202,9 +201,9 @@ void DefaultQuantParamsPass::QuantizeValue(OpBuilder builder, Value value, quantize.getOperation()->replaceUsesOfWith(dequantize, value); } -quant::QuantParams DefaultQuantParamsPass::GetQuantParamsForBias( +QuantParams DefaultQuantParamsPass::GetQuantParamsForBias( Operation *op, int bias, const std::vector &non_biases, - quant::AccumulatorScaleFunc func) { + AccumulatorScaleFunc func) { std::vector non_bias_types; non_bias_types.reserve(non_biases.size()); for (int non_bias : non_biases) { @@ -226,8 +225,7 @@ quant::QuantParams DefaultQuantParamsPass::GetQuantParamsForBias( /*legacy_float_scale=*/false); } -quant::QuantParams DefaultQuantParamsPass::GetDefaultQuantParams( - Builder builder) { +QuantParams DefaultQuantParamsPass::GetDefaultQuantParams(Builder builder) { if (!default_quant_params_) { default_quant_params_ = quantfork::fakeQuantAttrsToType( builder.getUnknownLoc(), diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index 03272ef73538..9e9bea497c60 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -26,29 +26,29 @@ include "tensorflow/compiler/mlir/lite/utils/utils.td" def CreateEmptyBoolAttr : NativeCodeCall<"::mlir::BoolAttr()">; def DenseElementsAttr : ElementsAttrBase< - CPred<"$_self.isa()">, + CPred<"llvm::isa($_self)">, "non-opaque constant tensor">; def F32ElementsAttr : ElementsAttrBase< - CPred<"$_self.cast().getShapedType().getElementType().isF32()">, "float constant tensor">; + CPred<"llvm::cast($_self).getShapedType().getElementType().isF32()">, "float constant tensor">; def Int64ElementsAttr : ElementsAttrBase< - CPred<"$_self.cast().getShapedType().getElementType().isInteger(64)">, "Int 64 constant tensor">; + CPred<"llvm::cast($_self).getShapedType().getElementType().isInteger(64)">, "Int 64 constant tensor">; // Extract the ith int element from an ArrayAttr $0 as an 32-bit IntegerAttr // with builder. class ExtractI32At : NativeCodeCall< - "$_builder.getI32IntegerAttr($_self.cast().getValue()[" # i # - "].cast().getInt())">; + "$_builder.getI32IntegerAttr(llvm::cast(llvm::cast($_self).getValue()[" # i # + "]).getInt())">; // Use the tensor type information from $0 and convert min $1, max $2 and // numBits $3 and narrowRange $4 to a QuantizedType. def ConvertToQuantTypeFromAttrs : NativeCodeCall< - "quant::GetQuantizedTypeAttr($_builder, $0.getType(), $1, $2, -1, $3, $4, /*is_signed=*/false)">; + "GetQuantizedTypeAttr($_builder, $0.getType(), $1, $2, -1, $3, $4, /*is_signed=*/false)">; // Converts an integer attribute $0 to 32-bit with builder. def convertIntAttrTo32Bit : NativeCodeCall< - "$_builder.getI32IntegerAttr($0.cast().getInt())">; + "$_builder.getI32IntegerAttr(llvm::cast($0).getInt())">; // Builds a constant bool attribute. class GetBoolAttr : @@ -56,15 +56,15 @@ class GetBoolAttr : // Converts an integer attribute $0 to 64-bit with builder. def convertIntAttrTo64Bit : NativeCodeCall< - "$_builder.getI64IntegerAttr($0.cast().getInt())">; + "$_builder.getI64IntegerAttr(llvm::cast($0).getInt())">; // Extracts the single integer element from $_self. def ExtractSingleElementAsInteger : NativeCodeCall< - "ExtractSingleElementAsInteger($_self.cast())">; + "ExtractSingleElementAsInteger(llvm::cast($_self))">; // Extracts the single int32 element from $_self. 
def ExtractSingleElementAsInt32 : NativeCodeCall< - "$_builder.getI32IntegerAttr(ExtractSingleElementAsInteger($_self.cast()).getInt())">; + "$_builder.getI32IntegerAttr(ExtractSingleElementAsInteger(llvm::cast($_self)).getInt())">; // Converts tensor with int64 to int32. def CreateTFCastToInt32Op : NativeCodeCall< @@ -75,7 +75,7 @@ def CreateInt32ConstOrCast : NativeCodeCall< // Creates an int32 constant op from an integer attribute $0. def CreateInt32ConstOpFromIntAttr - : NativeCodeCall<"$_builder.create($_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {static_cast($0.cast().getInt())}))">; + : NativeCodeCall<"$_builder.create($_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {static_cast(llvm::cast($0).getInt())}))">; //===----------------------------------------------------------------------===// // Nullary ops patterns. @@ -100,8 +100,8 @@ def IsDataFormatNHWC : ConstantAttr; def IsDataFormatNCHW : ConstantAttr; class I32VectorElementsAttr : ElementsAttrBase< - CPred<"$_self.isa() &&" - "$_self.cast().getType()." + CPred<"llvm::isa($_self) &&" + "llvm::cast($_self).getType()." "getElementType().isSignlessInteger(32)">, "32-bit int elements attribute of shape [" # len # "]"> { @@ -123,8 +123,8 @@ def IsAllOnes : AttrConstraint>; // Constraint that attribute is string with value either "SAME" or "VALID" def IsSameOrValid : AttrConstraint< - CPred<"$_self.cast().getValue() == \"SAME\" || " # - "$_self.cast().getValue() == \"VALID\"">, + CPred<"llvm::cast($_self).getValue() == \"SAME\" || " # + "llvm::cast($_self).getValue() == \"VALID\"">, "'SAME' or 'VALID' paddings">; def TFL_GetMirrorPaddingType : NativeCodeCall< @@ -307,7 +307,7 @@ def LegalizeSelectV2NotSameStaticShape : Pat< [(OpHasNotSameStaticShapes $src_op)]>; def LegalizeShape : Pat<(TF_ShapeOp $arg), (TFL_ShapeOp $arg)>; def LegalizeSigmoid : Pat<(TF_SigmoidOp $arg), (TFL_LogisticOp $arg)>; -def LegalizeSin : Pat<(TF_SinOp F32Tensor:$arg), (TFL_SinOp $arg)>; +def LegalizeSin : Pat<(TF_SinOp $arg), (TFL_SinOp $arg)>; def LegalizeSlice : Pat<(TF_SliceOp $input, $begin, $size), (TFL_SliceOp $input, $begin, $size)>; def LegalizeSoftmax : Pat<(TF_SoftmaxOp $arg), @@ -443,8 +443,8 @@ def LegalizeSum : Pat<(TF_SumOp $arg, $axes, BoolAttr:$arg2), def LegalizeTopKV2 : Pat<(TF_TopKV2Op $input, $k, $ignored_sorted), (TFL_TopKV2Op $input, $k)>; -def ReductionDimensionIsLastDim : Constraint().getInt() == " - "$1.getType().cast().getRank() - 1 || $0.cast().getInt() == -1)">>; +def ReductionDimensionIsLastDim : Constraint($0).getInt() == " + "llvm::cast($1.getType()).getRank() - 1 || llvm::cast($0).getInt() == -1)">>; // Legalizes TF_ApproxTopKOp to TFL_TopKV2Op with the following constraints: // 1. It computes max k @@ -558,10 +558,10 @@ def LegalizeConv2DBackpropInput : Pat< /*fused_activation_function=*/TFL_AF_None)>; def IsRankZeroAttr - : CPred<"$_self.cast().getType().getRank() == 0">; + : CPred<"llvm::cast($_self).getType().getRank() == 0">; def HasValueZero - : CPred<"$_self.cast()." + : CPred<"llvm::cast($_self)." "getSplatValue<::mlir::IntegerAttr>().getInt() == 0">; // TFLite only supports MatrixSetDiag ops with scalar zero k attribute. 
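The legalize_patterns.td rewrites above, like the C++ changes elsewhere in this patch, move from the deprecated member-function casts (attr.cast<T>(), attr.isa<T>(), attr.dyn_cast<T>()) to the LLVM-style free-function casts (llvm::cast<T>(attr), llvm::isa<T>(attr), llvm::dyn_cast<T>(attr)). A minimal sketch of the difference, using a hypothetical helper that is not part of this patch:

// Hypothetical illustration only; not part of this patch.
#include "llvm/Support/Casting.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinAttributes.h"

// Old, deprecated style:
//   int64_t v = attr.cast<mlir::IntegerAttr>().getInt();
//   bool is_int = attr.isa<mlir::IntegerAttr>();
// New style used throughout this patch:
static int64_t GetIntOrZero(mlir::Attribute attr) {
  // llvm::dyn_cast returns a null IntegerAttr when the cast fails.
  if (auto int_attr = llvm::dyn_cast<mlir::IntegerAttr>(attr))
    return int_attr.getInt();
  return 0;
}

The same free-function form works for Type and Value casts, which is why the TableGen predicates above now call llvm::cast and llvm::isa directly on $_self and operand types.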
diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_variables.td b/tensorflow/compiler/mlir/lite/transforms/legalize_variables.td index 5c26b6ea4685..72ec563930d7 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_variables.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_variables.td @@ -22,7 +22,7 @@ def HasSupportedElementType : Constraint>; def IsSupportedElementType : - Constraint())">>; + Constraint($0.getType()))">>; def LegalizeVarHandle : Pat< (TF_VarHandleOp:$result $container, $shared_name), diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_quant_annotations_pass.cc b/tensorflow/compiler/mlir/lite/transforms/lower_quant_annotations_pass.cc index 42a8c2d3c444..97689e5c42f9 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_quant_annotations_pass.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_quant_annotations_pass.cc @@ -384,7 +384,7 @@ void LowerQuantAnnotationsPass::runOnOperation() { prepare_patterns.add(&ctx); GreedyRewriteConfig greedy_config; - greedy_config.fold = true; + greedy_config.enableFolding(true); if (failed(applyPatternsGreedily(module, std::move(prepare_patterns), greedy_config))) { module.emitError( diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index 2b5b7537f515..182d593cb143 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -139,7 +139,7 @@ Value CreateI32SplatTensor(Location loc, PatternRewriter *rewriter, Type PrependLeadingDimIfRanked(int64_t dim, Type type, PatternRewriter *rewriter) { Type dtype = getElementTypeOrSelf(type); - if (RankedTensorType ty = type.dyn_cast()) { + if (RankedTensorType ty = llvm::dyn_cast(type)) { llvm::SmallVector shape = {dim}; shape.append(ty.getShape().begin(), ty.getShape().end()); return tensorflow::GetTypeFromTFTensorShape(shape, dtype); @@ -256,7 +256,7 @@ struct ConvertConst : public OpConversionPattern { ConversionPatternRewriter &rewriter) const override { // Verify that the tensor proto contains tensor of type variant and scalar // shape. The variant type should hold a TensorList. - auto proto_attr = op.getValue().dyn_cast(); + auto proto_attr = llvm::dyn_cast(op.getValue()); if (!proto_attr) return failure(); tensorflow::Tensor tensor; if (!tensorflow::ConvertToTensor(proto_attr, &tensor).ok()) @@ -270,13 +270,13 @@ struct ConvertConst : public OpConversionPattern { if (!list) return failure(); // Verify output type is variant and contains exactly one ranked subtypes. - auto variant_ty = - getElementTypeOrSelf(op.getType()).dyn_cast(); + auto variant_ty = llvm::dyn_cast( + getElementTypeOrSelf(op.getType())); if (!variant_ty) return failure(); ArrayRef subtypes = variant_ty.getSubtypes(); if (subtypes.size() != 1) return failure(); RankedTensorType list_element_ty = - subtypes.front().dyn_cast(); + llvm::dyn_cast(subtypes.front()); if (!list_element_ty) return failure(); // Extract tensor elements for the TensorList and construct result type @@ -372,7 +372,8 @@ struct ConvertTensorListSetItem loc, tensorflow::GetTypeFromTFTensorShape({1}, shape_dtype), item_rank, scalar_zero); // Create two slice ops. 
- Type element_type = input.getType().cast().getElementType(); + Type element_type = + llvm::cast(input.getType()).getElementType(); UnrankedTensorType unranked_tensor = UnrankedTensorType::get(element_type); Value scalar_minus_one = CreateI32SplatConst(loc, &rewriter, {}, -1); TF::SliceOp slice1 = @@ -441,7 +442,8 @@ struct ConvertTensorListSetItem // Expand the dimension of item so that it will have the same rank with // input. // ExpandDims(item, 0) - Type element_type = input.getType().cast().getElementType(); + Type element_type = + llvm::cast(input.getType()).getElementType(); UnrankedTensorType unranked_tensor = UnrankedTensorType::get(element_type); auto expanded_item = rewriter.create( op.getLoc(), unranked_tensor, item, scalar_zero); @@ -494,7 +496,8 @@ struct ConvertTensorListInitOp : public TensorListOpConverterBase { // looking at the first `TensorListSetItemOp` writing to this tensor list. // Here we assume that the element_shape won't be changed before calling // the first `TensorListSetItemOp`. - if (auto shaped_type = element_shape.getType().dyn_cast()) { + if (auto shaped_type = + llvm::dyn_cast(element_shape.getType())) { if (shaped_type.hasRank() && shaped_type.getRank() == 0) { bool element_shape_acquired = false; auto uses = op.getResult().getUses(); @@ -517,8 +520,8 @@ struct ConvertTensorListInitOp : public TensorListOpConverterBase { if (TF::TensorListSetItemOp set_op = llvm::dyn_cast( inside_use.getOwner())) { - if (auto shaped_type = - set_op.getItem().getType().dyn_cast()) { + if (auto shaped_type = llvm::dyn_cast( + set_op.getItem().getType())) { if (shaped_type.hasStaticShape()) { RankedTensorType type = tensorflow::GetTypeFromTFTensorShape( @@ -592,7 +595,8 @@ struct ConvertTensorListInitOp : public TensorListOpConverterBase { } auto attr = DenseIntElementsAttr::get( - element_shape.getType().cast(), new_element_shape_values); + llvm::cast(element_shape.getType()), + new_element_shape_values); auto new_element_shape = rewriter.create( op.getLoc(), element_shape.getType(), attr); element_shape = new_element_shape; @@ -603,7 +607,7 @@ struct ConvertTensorListInitOp : public TensorListOpConverterBase { Type result_type = UnrankedTensorType::get(element_dtype); Value leading_dim = GetNumElements(op, adaptor.getOperands(), &rewriter); if (auto element_type = - op.element_type().template dyn_cast()) { + llvm::dyn_cast(op.element_type())) { result_rank = element_type.getRank() + 1; int64_t leading_dim_v = -1; ElementsAttr element_attr; @@ -662,12 +666,12 @@ struct ConvertTensorListReserve return CreateI32SplatConst(op.getLoc(), rewriter, {1}, attr.getInt()); } if (auto const_op = num_elements.getDefiningOp()) { - return CreateI32SplatConst(op->getLoc(), rewriter, {1}, - (*const_op.getValue() - .cast() - .getValues() - .begin()) - .getSExtValue()); + return CreateI32SplatConst( + op->getLoc(), rewriter, {1}, + (*llvm::cast(const_op.getValue()) + .getValues() + .begin()) + .getSExtValue()); } return rewriter->create( op.getLoc(), tensorflow::GetTypeFromTFTensorShape({1}, shape_dtype), @@ -713,8 +717,8 @@ struct ConvertTensorListPushBack loc, expanded_item_type, item, scalar_zero); Type elem_type = getElementTypeOrSelf(item); - auto handle_dtype = getElementTypeOrSelf(op.getOutputHandle().getType()) - .cast(); + auto handle_dtype = llvm::cast( + getElementTypeOrSelf(op.getOutputHandle().getType())); Type result_type = GetTensorTypeForTensorList(elem_type, handle_dtype, &rewriter); @@ -756,8 +760,8 @@ struct ConvertTensorListResize // Infer result type of this op based 
on TF's shape inference result. Type elem_type = getElementTypeOrSelf(input_handle); - auto handle_dtype = getElementTypeOrSelf(op.getOutputHandle().getType()) - .cast(); + auto handle_dtype = llvm::cast( + getElementTypeOrSelf(op.getOutputHandle().getType())); Type result_type = GetTensorTypeForTensorList(elem_type, handle_dtype, &rewriter); @@ -952,7 +956,8 @@ struct ConvertTensorListStack // trivial Reshape op (that doesn't actually change the input's shape) and // also populate the shape info to the op result. The shape of the // tensorlist is inferred from `num_elements` and `element_shape`. - auto ranked_type = element_shape.getType().dyn_cast(); + auto ranked_type = + llvm::dyn_cast(element_shape.getType()); DenseIntElementsAttr dense_elem_attr; if ((ranked_type && ranked_type.getRank() == 0) || !matchPattern(element_shape, m_Constant(&dense_elem_attr))) { @@ -1013,7 +1018,7 @@ struct ConvertTensorListConcatV2 // First unpack the input tensor along the first dimension. Type input_element_type = getElementTypeOrSelf(input); int64_t num_unpacked = 0; - if (auto type = input.getType().dyn_cast()) { + if (auto type = llvm::dyn_cast(input.getType())) { if (type.getDimSize(0) > 0) { num_unpacked = type.getDimSize(0); } else { @@ -1091,7 +1096,7 @@ struct ConvertYield : public OpConversionPattern { // if `type` is a tensor of variant. Otherwise, returns `type` unmodified. Type VariantToUnrankedTensorType(Type type, Value value) { TF::VariantType variant_ty = - getElementTypeOrSelf(type).dyn_cast(); + llvm::dyn_cast(getElementTypeOrSelf(type)); if (!variant_ty) { return type; } @@ -1102,7 +1107,7 @@ Type VariantToUnrankedTensorType(Type type, Value value) { } Type value_type = value.getType(); Type element_type; - variant_ty = value_type.dyn_cast(); + variant_ty = llvm::dyn_cast(value_type); if (variant_ty && !variant_ty.getSubtypes().empty()) { element_type = variant_ty.getSubtypes()[0].getElementType(); } else { @@ -1114,7 +1119,7 @@ Type VariantToUnrankedTensorType(Type type, Value value) { // Returns true if we can deduce the type is tensorlist. bool IsTensorListType(Type type, std::optional value) { TF::VariantType variant_ty = - getElementTypeOrSelf(type).dyn_cast(); + llvm::dyn_cast(getElementTypeOrSelf(type)); if (!variant_ty) { return false; } @@ -1336,7 +1341,7 @@ llvm::DenseMap MapTensorListResultToArgument(func::FuncOp func) { break; } } - if (auto block_arg = parent.dyn_cast()) { + if (auto block_arg = dyn_cast(parent)) { return block_arg.getArgNumber(); } // Returns -1 if we don't find which this result maps to. @@ -1547,7 +1552,7 @@ void LowerStaticTensorListPass::runOnOperation() { // still. 
auto is_legal = [](Operation *op) { auto is_not_variant = [](Type ty) { - return !ty.cast().getElementType().isa(); + return !isa(cast(ty).getElementType()); }; return llvm::all_of(op->getOperandTypes(), is_not_variant) && llvm::all_of(op->getResultTypes(), is_not_variant); @@ -1555,8 +1560,7 @@ void LowerStaticTensorListPass::runOnOperation() { auto is_set_item_legal = [](Operation *op) { return op->hasAttr("resize_if_index_out_of_bounds") && - op->getAttr("resize_if_index_out_of_bounds") - .cast() + llvm::cast(op->getAttr("resize_if_index_out_of_bounds")) .getValue(); }; diff --git a/tensorflow/compiler/mlir/lite/transforms/modify_io_nodes.cc b/tensorflow/compiler/mlir/lite/transforms/modify_io_nodes.cc index 7fea1e395ea2..3c15da8e4e62 100644 --- a/tensorflow/compiler/mlir/lite/transforms/modify_io_nodes.cc +++ b/tensorflow/compiler/mlir/lite/transforms/modify_io_nodes.cc @@ -28,6 +28,7 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" namespace mlir { @@ -118,8 +119,8 @@ LogicalResult ModifyIONodesPass::ModifyInputNodes( quantize_output.replaceAllUsesWith(new_arg); } else if (input_type.isUnsignedInteger( current_type.getIntOrFloatBitWidth())) { // int8 != uint8 - arg_type = quant::ConvertSignedQuantizedToUnsigned( - quantize_output.getType(), loc); + arg_type = + ConvertSignedQuantizedToUnsigned(quantize_output.getType(), loc); new_arg = block.addArgument(arg_type, loc); quantize_op.setOperand(new_arg); } else { @@ -172,7 +173,7 @@ LogicalResult ModifyIONodesPass::ModifyOutputNodes( returned_value = dequantize_input; } else if (output_type.isUnsignedInteger( current_type.getIntOrFloatBitWidth())) { // int8 != uint8 - returned_type = quant::ConvertSignedQuantizedToUnsigned( + returned_type = ConvertSignedQuantizedToUnsigned( dequantize_input.getType(), dequantize_op.getLoc()); // replace the dequantize op by a quantize op TypeAttr type_attr = TypeAttr::get(returned_type); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul.td b/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul.td index 85bdf63babcb..bc82b1f496ac 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul.td @@ -26,8 +26,8 @@ include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" def NotFromDequant : Constraint>; def IsResultRankEqualTo : Constraint().getRank() == " - "$1.getType().cast().getRank()">>; + "llvm::cast($0.getType().front()).getRank() == " + "llvm::cast($1.getType()).getRank()">>; // Fuses TFL_FullyConnectedOp and TFL_TransposeOp Rhs to TFL_BatchMatMulOp when // it's used by TFL_BatchMatMulOp and "transpose_lhs" is true. diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.cc index 2451089517c5..71ebbab92c1a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.cc @@ -34,6 +34,7 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/tflite_passes/optimize_batch_matmul_utils.h" #include "tensorflow/compiler/mlir/lite/utils/utils.h" namespace mlir { @@ -56,7 +57,7 @@ bool NotFromDequant(mlir::Value value) { // Converts batch_matmul operation to fully_connected if rhs is a // constant tensor with rank 2 -struct ConvertBatchMatMulOp2FullyConnectedOp +struct ConvertBatchMatMulOp2FullyConnectedOp_Rank2ConstantRhs : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(TFL::BatchMatMulOp bmm_op, @@ -263,6 +264,127 @@ struct ConvertBatchMatMulOpToReduceSum return false; } }; + +// Pattern to fuse transpose op into RHS of batch_matmul op if the transpose and +// batch_matmul are separated by a reshape op; and the transpose op is used +// exclusively to transpose the contracting dimension and the LHS-Output +// dimension. +// Converts batch_matmul operation to fully_connected if rhs is rank-2 +// else converts it to a BatchMatMul op with adj_y = true and transpose fused +// into RHS. +// +// Example: +// % 0 = "tfl.transpose" // Input: [2048, 32, 128] -> [128, 2048, 32] +// % 1 = "tfl.reshape"(%0) // reshaped [128, 2048, 32] -> [128, 65536] +// % 2 = "tfl.batch_matmul" // LHS: [4, 128], RHS: [128, 65536] -> [4, 65536] +struct FuseRhsTransposeIntoBatchMatMulOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(TFL::BatchMatMulOp bmm_op, + PatternRewriter& rewriter) const override { + // Exit the pattern if adj_y is true. + if (bmm_op.getAdjY()) { + return rewriter.notifyMatchFailure( + bmm_op, "Pattern does not apply when adj_y is true."); + } + + // Exit the pattern if the RHS of BatchMatMulOp is not originated from a + // TFL::TransposeOp->TFL::ReshapeOp. + auto reshape_op = bmm_op.getY().getDefiningOp(); + if (!reshape_op) { + return rewriter.notifyMatchFailure( + bmm_op, + "RHS is not originated from a transpose->reshape op pattern."); + } + + auto transpose_op = reshape_op.getInput().getDefiningOp(); + if (!transpose_op) { + return rewriter.notifyMatchFailure( + bmm_op, + "RHS is not originated from a transpose->reshape op pattern."); + } + + // Get the dimensions info of the RHS of BatchMatMulOp. + auto rhs_dimensions_info = GetBatchMatMulRhsDimensionsInfo( + mlir::cast(bmm_op.getY().getType())); + + // Make sure that the reshape op is flattening either the contracting + // dimension or the output dimension. + auto reshape_input_shape = GetShape(reshape_op.getInput()); + if (!HasFlattenedContractingDims(reshape_input_shape, + rhs_dimensions_info) && + !HasFlattenedOutDims(reshape_input_shape, rhs_dimensions_info)) { + return rewriter.notifyMatchFailure( + bmm_op, + "Reshape op is not flattening the contracting dimension or the " + "output dimension."); + } + + // Make sure that the transpose op is only transposing the contracting + // dimensions and the output dimensions. 
+ auto transpose_perm_status_or_value = + GetValueAsIntArray(transpose_op.getPerm()); + auto transpose_input_shape = GetShape(transpose_op.getInput()); + if (transpose_perm_status_or_value.ok() && + !HasTransposedContractingAndOutDims( + transpose_input_shape, transpose_perm_status_or_value.value(), + rhs_dimensions_info)) { + return rewriter.notifyMatchFailure( + bmm_op, + "Transpose op is not transposing the contracting dimension and the " + "output dimension."); + } + + auto rhs_contracting_dimensions = + rhs_dimensions_info.contracting_dimensions(); + auto rhs_out_dimensions = rhs_dimensions_info.out_dimensions(); + auto rhs_batch_dimensions = rhs_dimensions_info.batch_dimensions(); + + // Create a new ReshapeOp, without the TransposeOp, to flatten the + // contracting dimension and the output dimension, as needed. + llvm::SmallVector new_reshape_input_shape; + if (!rhs_dimensions_info.batch_dimensions().AxesArray().empty()) { + for (auto dim_size : rhs_batch_dimensions.SizesArray()) { + new_reshape_input_shape.push_back(dim_size); + } + } + new_reshape_input_shape.push_back(rhs_out_dimensions.SizesArray().front()); + new_reshape_input_shape.push_back( + rhs_contracting_dimensions.SizesArray().front()); + + Value new_reshape_shape_value = rewriter.create( + bmm_op->getLoc(), + GetI32ElementsAttr(new_reshape_input_shape, &rewriter)); + auto new_reshape_value = rewriter.create( + bmm_op->getLoc(), transpose_op.getInput(), new_reshape_shape_value); + + // Replace the BatchMatMulOp with a FullyConnectedOp, if the RHS of BMM has + // no broadcasting dimensions. I.e. RHS of BMM is of Rank 2. + if (rhs_dimensions_info.batch_dimensions().AxesArray().empty()) { + auto no_input = rewriter.create( + bmm_op->getLoc(), rewriter.getNoneType(), rewriter.getUnitAttr()); + auto fc_op = rewriter.create( + bmm_op->getLoc(), ArrayRef{bmm_op.getType()}, + /*input=*/bmm_op.getX(), /*filter=*/new_reshape_value, + /*bias=*/no_input, + /*fused_activation_function=*/rewriter.getStringAttr("NONE"), + /*weights_format=*/rewriter.getStringAttr("DEFAULT"), + /*keep_num_dims=*/rewriter.getBoolAttr(true), + /*asymmetric_quantize_inputs=*/mlir::BoolAttr()); + rewriter.replaceOp(bmm_op, {fc_op.getResult(0)}); + } else { + // Replace the BatchMatMulOp with a BatchMatMulOp with adj_y = true and + // transpose fused into RHS. + auto bmm_op_with_adj_y = rewriter.create( + bmm_op->getLoc(), bmm_op.getType(), bmm_op.getX(), new_reshape_value, + bmm_op.getAdjX(), /*adj_y=*/true, mlir::BoolAttr()); + rewriter.replaceOp(bmm_op, {bmm_op_with_adj_y.getResult()}); + } + + return success(); + } +}; + #include "tensorflow/compiler/mlir/lite/transforms/generated_optimize_batch_matmul.inc" } // namespace @@ -271,8 +393,10 @@ void OptimizeBatchMatmulPass::runOnOperation() { auto* ctx = &getContext(); RewritePatternSet patterns(ctx); - patterns.add(ctx); + patterns + .add( + ctx); TFL::populateWithGenerated(patterns); (void)applyPatternsGreedily(func, std::move(patterns)); } diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.cc index 52f91d32e8ba..aed2946db17b 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.cc @@ -27,9 +27,11 @@ limitations under the License. 
#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project // IWYU pragma: keep #include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project @@ -40,6 +42,8 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass_options.h" +#include "tensorflow/compiler/mlir/lite/utils/utils.h" namespace mlir { namespace TFL { @@ -52,8 +56,10 @@ using BroadcastedShapeFunction = class ConvertResultsBroadcastableShapeOp : public RewritePattern { public: - explicit ConvertResultsBroadcastableShapeOp(MLIRContext* context) - : RewritePattern(MatchAnyOpTypeTag(), /*PatternBenefit*/ 1, context) {} + explicit ConvertResultsBroadcastableShapeOp( + MLIRContext* context, const OptimizeBroadcastLikePassOptions& options) + : RewritePattern(MatchAnyOpTypeTag(), /*PatternBenefit*/ 1, context), + options_(options) {} LogicalResult matchAndRewrite(Operation* op, PatternRewriter& rewriter) const override; @@ -62,6 +68,9 @@ class ConvertResultsBroadcastableShapeOp : public RewritePattern { LogicalResult RewriteOp( Operation* op, PatternRewriter& rewriter, BroadcastedShapeFunction& get_broadcasted_shape) const; + + private: + const OptimizeBroadcastLikePassOptions& options_; }; // Some tfl ops only support implicit broadcasting up to a certain rank. @@ -188,7 +197,8 @@ LogicalResult ConvertResultsBroadcastableShapeOp::RewriteOp( // Check that the result shape is fully defined. auto result_type = llvm::cast(op->getResultTypes().front()); - if (!result_type || !result_type.hasStaticShape()) + if (!result_type || (!options_.unsafe_fuse_dynamic_shaped_broadcast && + !result_type.hasStaticShape())) return rewriter.notifyMatchFailure( op, "Unsupported result shape for broadcasting on op: " + op->getName().getStringRef()); @@ -221,7 +231,10 @@ LogicalResult ConvertResultsBroadcastableShapeOp::RewriteOp( // Check that the operand of the broadcast has fully defined shape. auto broadcast_arg_type = llvm::cast(broadcast_like_op_input.getType()); - if (!broadcast_arg_type || !broadcast_arg_type.hasStaticShape()) continue; + if (!broadcast_arg_type || + (!options_.unsafe_fuse_dynamic_shaped_broadcast && + !broadcast_arg_type.hasStaticShape())) + continue; auto other_arg = op->getOpOperand(1 - i).get(); // If non-splat operand is not fusable affine ops, then no need to apply @@ -235,7 +248,9 @@ LogicalResult ConvertResultsBroadcastableShapeOp::RewriteOp( // Check that the other argument has fully defined shape. auto other_arg_type = llvm::cast(other_arg.getType()); - if (!other_arg_type || !other_arg_type.hasStaticShape()) continue; + if (!other_arg_type || (!options_.unsafe_fuse_dynamic_shaped_broadcast && + !other_arg_type.hasStaticShape())) + continue; // Get the unbroadcasted shapes in the operand order. 
std::array, 2> operand_shapes; @@ -265,8 +280,9 @@ LogicalResult ConvertResultsBroadcastableShapeOp::RewriteOp( class ConvertResultsBroadcastableBatchMatMulShapeOp : public ConvertResultsBroadcastableShapeOp { public: - explicit ConvertResultsBroadcastableBatchMatMulShapeOp(MLIRContext* context) - : ConvertResultsBroadcastableShapeOp(context) {} + explicit ConvertResultsBroadcastableBatchMatMulShapeOp( + MLIRContext* context, const OptimizeBroadcastLikePassOptions& options) + : ConvertResultsBroadcastableShapeOp(context, options) {} LogicalResult matchAndRewrite(Operation* op, PatternRewriter& rewriter) const override; @@ -330,6 +346,50 @@ LogicalResult ConvertResultsBroadcastableBatchMatMulShapeOp::RewriteOp( get_broadcasted_shape); } +class ReorderBroadcastToCast : public RewritePattern { + public: + explicit ReorderBroadcastToCast(MLIRContext* context) + : RewritePattern(TFL::CastOp::getOperationName(), /*PatternBenefit*/ 1, + context) {} + + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override; +}; + +LogicalResult ReorderBroadcastToCast::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + auto cast_op = llvm::dyn_cast(op); + if (!cast_op) return rewriter.notifyMatchFailure(op, "Not a CastOp"); + + auto broadcast_to_op = llvm::dyn_cast_or_null( + cast_op.getInput().getDefiningOp()); + if (!broadcast_to_op) + return rewriter.notifyMatchFailure(op, "Not a BroadcastToOp"); + + auto fused_loc = FusedLoc::get(cast_op.getContext(), + {cast_op.getLoc(), broadcast_to_op.getLoc()}); + + auto input_value = broadcast_to_op.getInput(); + auto input_type = input_value.getType(); + auto old_cast_op_output_type = cast_op.getOutput().getType(); + auto new_cast_op_output_type = + old_cast_op_output_type.hasRank() + ? static_cast( + RankedTensorType::get(input_type.getShape(), + old_cast_op_output_type.getElementType())) + : static_cast(UnrankedTensorType::get( + old_cast_op_output_type.getElementType())); + + auto new_cast_op = rewriter.create( + fused_loc, new_cast_op_output_type, input_value); + auto new_broadcast_to_op = rewriter.create( + fused_loc, old_cast_op_output_type, new_cast_op.getOutput(), + broadcast_to_op.getShape()); + + rewriter.replaceOp(cast_op, new_broadcast_to_op.getOutput()); + return success(); +} + #include "tensorflow/compiler/mlir/lite/transforms/generated_optimize_broadcast_like.inc" } // namespace @@ -337,9 +397,11 @@ void OptimizeBroadcastLikePass::runOnOperation() { RewritePatternSet patterns(&getContext()); auto func = getOperation(); - patterns.add(func.getContext()); - patterns.add( - func.getContext()); + patterns.add(func.getContext(), + GetOptions()); + patterns.add(func.getContext(), + GetOptions()); + patterns.add(func.getContext()); TFL::populateWithGenerated(patterns); (void)applyPatternsGreedily(getOperation(), std::move(patterns)); } diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.h b/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.h index f13048a19826..0b5f8f1f6bc2 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.h +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.h @@ -16,24 +16,28 @@ limitations under the License. 
#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/Pass/PassOptions.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass_options.h" #include "tensorflow/compiler/mlir/lite/transforms/pass.h" -#include "tensorflow/compiler/mlir/lite/transforms/pass_options.h" namespace mlir { namespace TFL { // Pass to optimize explicit broadcasting-like patterns. class OptimizeBroadcastLikePass - : public TFL::Pass { + : public TFL::Pass { public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OptimizeBroadcastLikePass) OptimizeBroadcastLikePass() = default; OptimizeBroadcastLikePass(const OptimizeBroadcastLikePass&) {}; + explicit OptimizeBroadcastLikePass(const mlir::detail::PassOptions& options) + : Pass(options) {} void runOnOperation() override; static llvm::StringRef GetName() { return "OptimizeBroadcastLikePass"; } diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass_options.h b/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass_options.h new file mode 100644 index 000000000000..7d11f5d74cc4 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass_options.h @@ -0,0 +1,41 @@ + +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_BROADCAST_LIKE_PASS_OPTIONS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_BROADCAST_LIKE_PASS_OPTIONS_H_ + +#include "llvm/Support/CommandLine.h" +#include "mlir/Pass/PassOptions.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +//////////////////////////////////////////////////////////////////////////////// +// Pass Options +//////////////////////////////////////////////////////////////////////////////// + +struct OptimizeBroadcastLikePassOptions : public mlir::detail::PassOptions { + mlir::detail::PassOptions::Option unsafe_fuse_dynamic_shaped_broadcast{ + *this, "unsafe-fuse-dynamic-shaped-broadcast", + llvm::cl::desc( + "Enable fusion of dynamic shaped broadcast ops. 
It helps fuse " + "implicit broadcasting ops when the output shape has dynamic dimensions, " + "but it may cause incorrect results when broadcasting ops are " + "introduced by explicit broadcasting in the source model."), + llvm::cl::init(false)}; +}; + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_OPTIMIZE_BROADCAST_LIKE_PASS_OPTIONS_H_ diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_patterns.td index 4a0409eeea3b..945c67090f08 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_patterns.td @@ -23,6 +23,9 @@ include "mlir/Dialect/Func/IR/FuncOps.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" include "tensorflow/compiler/mlir/lite/utils/utils.td" +// Checks if the value has only one user. +def HasOneUse : Constraint>; + //////////////////////////////////////////////////////////////////////////////// // Patterns on TFL::Select*Op to optimize explicit broadcasting-like patterns. //////////////////////////////////////////////////////////////////////////////// @@ -130,3 +133,263 @@ foreach SelectOp = [TFL_SelectOp, TFL_SelectV2Op] in { // Fuse broadcast to into select op. defm : FuseBroadcastToIntoSelectOp; } + +// Checks if the value has only one use or is used by an elementwise op. +def HasOneUseOrUsedByElementwiseOp : Constraint(user);" + "}))" + >>; + +//////////////////////////////////////////////////////////////////////////////// +// Patterns on TFL:: to optimize explicit broadcast_to patterns. +//////////////////////////////////////////////////////////////////////////////// + +// ConvertResultsBroadcastableShapeOp pattern in this pass fuses the +// broadcast_to op into the TFL ops that support implicit broadcasting. +// The patterns below aim to handle all other broadcast_to ops that remain, +// by moving the broadcast_to op after the binary op. This way, the +// broadcast_to op can get the opportunity to be fused into the consumer of the +// binary op. + +// TFL_DivOp needs to be handled separately because it supports implicit +// broadcasting only for rank<=5.
+def ReorderBroadcastToOpAndDivOpLhs : Pat< + (TFL_DivOp:$result + (TFL_BroadcastToOp:$post_broadcast AnyStaticShapeTensor:$pre_broadcast, $dim), + AnyStaticShapeTensor:$input2, $act_fn2), + (TFL_BroadcastToOp + (TFL_DivOp $pre_broadcast, $input2, $act_fn2), $dim), + [(IsNotQuantized $post_broadcast), + (OperandsDontBroadcastToOutputType $input2, $pre_broadcast, $post_broadcast), + (HasSameStaticShapes $post_broadcast, $result), + (HasOneUse $post_broadcast), + (HasRankAtMost<5> $post_broadcast)]>; + +def ReorderBroadcastToOpAndDivOpRhs : Pat< + (TFL_DivOp:$result + AnyStaticShapeTensor:$input1, + (TFL_BroadcastToOp:$post_broadcast AnyStaticShapeTensor:$pre_broadcast, $dim), $act_fn2), + (TFL_BroadcastToOp + (TFL_DivOp $input1, $pre_broadcast, $act_fn2), $dim), + [(IsNotQuantized $post_broadcast), + (OperandsDontBroadcastToOutputType $input1, $pre_broadcast, $post_broadcast), + (HasSameStaticShapes $post_broadcast, $result), + (HasOneUse $post_broadcast), + (HasRankAtMost<5> $post_broadcast)]>; + +def ReorderBroadcastToOpAndDivOpWithSplatLhs : Pat< + (TFL_DivOp:$result + (TFL_BroadcastToOp:$post_broadcast AnyStaticShapeTensor:$pre_broadcast, $dim), + (Arith_ConstantOp:$constant_value SplatElementsAttr:$constant_attr), $act_fn2), + (TFL_BroadcastToOp + (TFL_DivOp $pre_broadcast, + (Arith_ConstantOp (GetScalarElementsAttrFromSplat $constant_attr)), $act_fn2), + $dim), + [(IsNotQuantized $post_broadcast), + (OperandsDontBroadcastToOutputType $constant_value, $pre_broadcast, $post_broadcast), + (HasSameStaticShapes $post_broadcast, $result), + (HasOneUse $post_broadcast), + (HasRankAtMost<5> $post_broadcast)]>; + + def ReorderBroadcastToOpAndDivOpWithSplat2Rhs : Pat< + (TFL_DivOp:$result + (Arith_ConstantOp:$constant_value SplatElementsAttr:$constant_attr), + (TFL_BroadcastToOp:$post_broadcast AnyStaticShapeTensor:$pre_broadcast, $dim), $act_fn2), + (TFL_BroadcastToOp + (TFL_DivOp (Arith_ConstantOp (GetScalarElementsAttrFromSplat $constant_attr)), $pre_broadcast, $act_fn2), + $dim), + [(IsNotQuantized $post_broadcast), + (OperandsDontBroadcastToOutputType $constant_value, $pre_broadcast, $post_broadcast), + (HasSameStaticShapes $post_broadcast, $result), + (HasOneUse $post_broadcast), + (HasRankAtMost<5> $post_broadcast)]>; + + +multiclass ReorderBroadcastToOpAndBinaryOpWithActFn { + def ReorderBroadcastToOpAnd#BinaryOp#Lhs : Pat< + (BinaryOp:$result + (TFL_BroadcastToOp:$post_broadcast AnyStaticShapeTensor:$pre_broadcast, $dim), + AnyStaticShapeTensor:$input2, $act_fn2), + (TFL_BroadcastToOp + (BinaryOp $pre_broadcast, $input2, $act_fn2), $dim), + [(IsNotQuantized $post_broadcast), + (OperandsDontBroadcastToOutputType $input2, $pre_broadcast, $post_broadcast), + (HasSameStaticShapes $post_broadcast, $result), + (HasOneUse $post_broadcast), + (HasRankAtMost<6> $post_broadcast)]>; + + + def ReorderBroadcastToOpAnd#BinaryOp#Rhs : Pat< + (BinaryOp:$result + AnyStaticShapeTensor:$input1, + (TFL_BroadcastToOp:$post_broadcast AnyStaticShapeTensor:$pre_broadcast, $dim), $act_fn2), + (TFL_BroadcastToOp + (BinaryOp $input1, $pre_broadcast, $act_fn2), $dim), + [(IsNotQuantized $post_broadcast), + (OperandsDontBroadcastToOutputType $input1, $pre_broadcast, $post_broadcast), + (HasSameStaticShapes $post_broadcast, $result), + (HasOneUse $post_broadcast), + (HasRankAtMost<6> $post_broadcast)]>; + + def ReorderBroadcastToOpAnd#BinaryOp#WithSplatLhs : Pat< + (BinaryOp:$result + (TFL_BroadcastToOp:$post_broadcast AnyStaticShapeTensor:$pre_broadcast, $dim), + (Arith_ConstantOp:$constant_value 
SplatElementsAttr:$constant_attr), $act_fn2), + (TFL_BroadcastToOp + (BinaryOp $pre_broadcast, + (Arith_ConstantOp (GetScalarElementsAttrFromSplat $constant_attr)), $act_fn2), + $dim), + [(IsNotQuantized $post_broadcast), + (OperandsDontBroadcastToOutputType $constant_value, $pre_broadcast, $post_broadcast), + (HasSameStaticShapes $post_broadcast, $result), + (HasOneUse $post_broadcast), + (HasRankAtMost<6> $post_broadcast)]>; + + def ReorderBroadcastToOpAnd#BinaryOp#WithSplat2Rhs : Pat< + (BinaryOp:$result + (Arith_ConstantOp:$constant_value SplatElementsAttr:$constant_attr), + (TFL_BroadcastToOp:$post_broadcast AnyStaticShapeTensor:$pre_broadcast, $dim), $act_fn2), + (TFL_BroadcastToOp + (BinaryOp (Arith_ConstantOp (GetScalarElementsAttrFromSplat $constant_attr)), $pre_broadcast, $act_fn2), + $dim), + [(IsNotQuantized $post_broadcast), + (OperandsDontBroadcastToOutputType $constant_value, $pre_broadcast, $post_broadcast), + (HasSameStaticShapes $post_broadcast, $result), + (HasOneUse $post_broadcast), + (HasRankAtMost<6> $post_broadcast)]>; +} + +foreach BinaryOp = [TFL_AddOp, TFL_SubOp, TFL_MulOp] in { + // Reorder broadcast to after binary op. + defm : ReorderBroadcastToOpAndBinaryOpWithActFn; +} + +multiclass ReorderBroadcastToOpAndBinaryOpWithoutActFn { + def ReorderBroadcastToOpAnd#BinaryOp#Lhs : Pat< + (BinaryOp:$result + (TFL_BroadcastToOp:$post_broadcast AnyStaticShapeTensor:$pre_broadcast, $dim), + AnyStaticShapeTensor:$input2), + (TFL_BroadcastToOp + (BinaryOp $pre_broadcast, $input2), $dim), + [(IsNotQuantized $post_broadcast), + (OperandsDontBroadcastToOutputType $input2, $pre_broadcast, $post_broadcast), + (HasSameStaticShapes $post_broadcast, $result), + (HasOneUse $post_broadcast), + (HasRankAtMost<4> $post_broadcast)]>; + + def ReorderBroadcastToOpAnd#BinaryOp#Rhs : Pat< + (BinaryOp:$result + AnyStaticShapeTensor:$input1, + (TFL_BroadcastToOp:$post_broadcast AnyStaticShapeTensor:$pre_broadcast, $dim)), + (TFL_BroadcastToOp + (BinaryOp $input1, $pre_broadcast), $dim), + [(IsNotQuantized $post_broadcast), + (OperandsDontBroadcastToOutputType $input1, $pre_broadcast, $post_broadcast), + (HasSameStaticShapes $post_broadcast, $result), + (HasOneUse $post_broadcast), + (HasRankAtMost<4> $post_broadcast)]>; + + def ReorderBroadcastToOpAnd#BinaryOp#WithSplatLhs : Pat< + (BinaryOp:$result + (TFL_BroadcastToOp:$post_broadcast AnyStaticShapeTensor:$pre_broadcast, $dim), + (Arith_ConstantOp:$constant_value SplatElementsAttr:$constant_attr)), + (TFL_BroadcastToOp + (BinaryOp $pre_broadcast, + (Arith_ConstantOp (GetScalarElementsAttrFromSplat $constant_attr))), + $dim), + [(IsNotQuantized $post_broadcast), + (OperandsDontBroadcastToOutputType $constant_value, $pre_broadcast, $post_broadcast), + (HasSameStaticShapes $post_broadcast, $result), + (HasOneUse $post_broadcast), + (HasRankAtMost<4> $post_broadcast)]>; + + def ReorderBroadcastToOpAnd#BinaryOp#WithSplat2Rhs : Pat< + (BinaryOp:$result + (Arith_ConstantOp:$constant_value SplatElementsAttr:$constant_attr), + (TFL_BroadcastToOp:$post_broadcast AnyStaticShapeTensor:$pre_broadcast, $dim)), + (TFL_BroadcastToOp + (BinaryOp (Arith_ConstantOp (GetScalarElementsAttrFromSplat $constant_attr)), $pre_broadcast), + $dim), + [(IsNotQuantized $post_broadcast), + (OperandsDontBroadcastToOutputType $constant_value, $pre_broadcast, $post_broadcast), + (HasSameStaticShapes $post_broadcast, $result), + (HasOneUse $post_broadcast), + (HasRankAtMost<4> $post_broadcast)]>; +} + +foreach BinaryOp = [TFL_MinimumOp, TFL_MaximumOp, TFL_LessOp, + 
TFL_LessEqualOp, TFL_GreaterOp, + TFL_GreaterEqualOp, TFL_NotEqualOp, TFL_EqualOp, TFL_PowOp, + TFL_SquaredDifferenceOp, TFL_FloorDivOp, TFL_FloorModOp] in { + // Reorder broadcast to after binary op without act fn. + defm : ReorderBroadcastToOpAndBinaryOpWithoutActFn; +} + +//////////////////////////////////////////////////////////////////////////////// +// Reorder TFL:: with the TFL::broadcast_to operator. +//////////////////////////////////////////////////////////////////////////////// +multiclass ReorderBroadcastToAndUnaryOp { + def ReorderBroadcastToOf#UnaryOp : Pat< + (UnaryOp (TFL_BroadcastToOp AnyStaticShapeTensor:$input, $dim)), + (TFL_BroadcastToOp (UnaryOp $input), $dim)>; +} + +// TFL_CastOp of requires special handling due to not having a builder, it's +// implemented in native code in ReorderBroadcastToCast. +foreach UnaryOp = [TFL_AbsOp, TFL_CeilOp, TFL_ComplexAbsOp, TFL_CosOp, + TFL_DequantizeOp, TFL_EluOp, TFL_ExpOp, TFL_FloorOp, + TFL_HardSwishOp, TFL_ImagOp, TFL_LogOp, TFL_LogicalNotOp, + TFL_LogisticOp, TFL_NegOp, TFL_RealOp, TFL_Relu0To1Op, + TFL_Relu1Op, TFL_Relu6Op, TFL_ReluOp, TFL_RoundOp, + TFL_RsqrtOp, TFL_SignOp, TFL_SinOp, TFL_SqrtOp, TFL_SquareOp, + TFL_TanhOp, TFL_ZerosLikeOp] in { + defm : ReorderBroadcastToAndUnaryOp; +} + +//////////////////////////////////////////////////////////////////////////////// +// Remove redundant broadcast_to op. +//////////////////////////////////////////////////////////////////////////////// +def RemoveRedundantBroadcastToOp : Pat< + (TFL_BroadcastToOp:$result AnyStaticShapeTensor:$pre_broadcast, $dim), + (replaceWithValue $pre_broadcast), + [(HasSameStaticShapes $pre_broadcast, $result)]>; + +//////////////////////////////////////////////////////////////////////////////// +// Reorder TFL::SumOp with the TFL::broadcast_to operator. +//////////////////////////////////////////////////////////////////////////////// + +def HasDistinctBroadcastAndReduceAxes : Constraint>; + +// Pattern to transform tfl.sum(tfl.broadcast_to(input, shape=S1), axis=B, keep_dims=true) +// into tfl.broadcast_to(tfl.sum(input, axis=B, keep_dims=true), shape=S2) +// where S1 is intermediate_target_shape_val, B is reduction_indices_val, +// and S2 is the computed final_target_shape_val (shape of original sum). +def ReorderBroadcastToAfterSumOp : Pat< + (TFL_SumOp:$original_sum + (TFL_BroadcastToOp:$intermediate_broadcast + AnyStaticShapeTensor:$original_input, + (Arith_ConstantOp $intermediate_target_shape_val)), + (Arith_ConstantOp I32ElementsAttr:$reduction_indices_val), + $keep_dims), + (TFL_BroadcastToOp + (TFL_SumOp + $original_input, + (Arith_ConstantOp $reduction_indices_val), + $keep_dims), + (Arith_ConstantOp (GetShapeAttr $original_sum))), + [(HasOneUse $intermediate_broadcast), + (HasDistinctBroadcastAndReduceAxes + $original_input, $reduction_indices_val, $intermediate_target_shape_val), + ]>; diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_op_order.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_op_order.cc index b853af538f4f..1e06c574d419 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_op_order.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_op_order.cc @@ -15,6 +15,7 @@ limitations under the License. #include +#include "llvm/Support/Casting.h" #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project @@ -23,8 +24,8 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_traits.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h" namespace mlir { namespace TFL { @@ -60,7 +61,7 @@ struct PushDownDequantize : public OpRewritePattern { // If the op is the pass-through op with (3x) smaller output, the dequantize // op can be pushed down to the single result of this op. - if (!llvm::dyn_cast(passthrough_op) || + if (!llvm::dyn_cast(passthrough_op) || passthrough_op->getNumResults() != 1) { return failure(); } diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_pass.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_pass.cc index a4c28fb9155a..f6b09eb99419 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_pass.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_pass.cc @@ -57,13 +57,13 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/optimize_pass_options.h" #include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h" #include "tensorflow/compiler/mlir/lite/utils/constant_utils.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/lite/utils/utils.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" @@ -519,7 +519,7 @@ DenseElementsAttr ExpandTo4DForDepthwiseConv(Attribute a) { } TypeAttr RescaleQtype(Type input, Attribute factor) { - return quant::RescaleQuantizedType(input, factor); + return RescaleQuantizedType(input, factor); } // Returns `true` if reducing `axes` in `input` with `keep_dims=true` results @@ -824,21 +824,6 @@ bool IsPermutationNCHW(Value perm) { #include "tensorflow/compiler/mlir/lite/transforms/generated_optimize.inc" -// Returns 1D 32-bit dense elements attribute with the given values. -static DenseIntElementsAttr GetI32ElementsAttr(ArrayRef values, - Builder *builder) { - RankedTensorType ty = mlir::RankedTensorType::get( - {static_cast(values.size())}, builder->getIntegerType(32)); - return DenseIntElementsAttr::get(ty, values); -} - -DenseIntElementsAttr GetI64ElementsAttr(ArrayRef values, - Builder *builder) { - RankedTensorType ty = RankedTensorType::get( - {static_cast(values.size())}, builder->getIntegerType(64)); - return DenseIntElementsAttr::get(ty, values); -} - // Get the number of leading 1s in the shape of the given input. // Ex. input_shape = [1 x 1 x 1 x 1 x 2 x 1] => 4 // returns 0 if the input shape is not static. @@ -992,80 +977,6 @@ struct SqueezeReshapesAroundBroadcastOp } }; -// This pattern matches TFL::BroadcastToOp WITH TENSOR RANK <= 4 and replaces -// it with a MulOp that multiplies the tensor by a splat constant with 1s. 
-struct ConvertTFLBroadcastToMulOp - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(TFL::BroadcastToOp tfl_broadcast_to_op, - PatternRewriter &rewriter) const override { - auto input_type = - mlir::cast(tfl_broadcast_to_op.getInput().getType()); - auto output_type = - mlir::cast(tfl_broadcast_to_op.getOutput().getType()); - auto shape_type = - mlir::cast(tfl_broadcast_to_op.getShape().getType()); - Type element_type = input_type.getElementType(); - - auto loc = tfl_broadcast_to_op->getLoc(); - - // Check that the output type is not dynamic and is less-than-equal to 4D or - // the shape type is static, 1D and has less-than-equal to 4 elements. - bool is_output_shape_dynamic = - (!output_type.hasRank() || (output_type.getRank() > 4) || - (output_type.getNumDynamicDims() > 0)); - bool is_broadcast_shape_dynamic = - (!shape_type.hasStaticShape() || (shape_type.getRank() != 1) || - (shape_type.getDimSize(0) > 4)); - if (is_output_shape_dynamic && is_broadcast_shape_dynamic) - return rewriter.notifyMatchFailure( - loc, "output_rank or broadcast_to shape not supported"); - - // Allow lowering when the input's elements type is F32, BFloat16, I32 or - // I16. - if (!(mlir::isa(element_type) || - element_type.isInteger(32) || element_type.isInteger(16))) - return rewriter.notifyMatchFailure(loc, "element_type_not_supported"); - - // TFL_FillOp is created only if is_output_shape_dynamic is true, otherwise - // a Arith.ConstOp is created. - if (is_output_shape_dynamic && - output_type.getElementType().isUnsignedInteger()) { - return rewriter.notifyMatchFailure( - loc, - "Unsigned broadcast_to output with dynamic shape is not supported"); - } - - Value mul_rhs_value; - if (!output_type.hasRank() || (output_type.getNumDynamicDims() > 0)) { - auto status_or_const_op = - CreateConstOpWithSingleValue(&rewriter, loc, input_type, 1); - if (!status_or_const_op.ok()) { - return failure(); - } - - mul_rhs_value = rewriter.create( - loc, output_type, tfl_broadcast_to_op.getShape(), - status_or_const_op.value()); - } else { - auto status_or_const_op = - CreateConstOpWithVectorValue(&rewriter, loc, output_type, 1); - if (!status_or_const_op.ok()) { - return failure(); - } - - mul_rhs_value = status_or_const_op.value(); - } - - auto mul_op = rewriter.create( - loc, output_type, tfl_broadcast_to_op.getInput(), mul_rhs_value, - rewriter.getStringAttr("NONE")); - rewriter.replaceOp(tfl_broadcast_to_op, mul_op.getResult()); - return success(); - } -}; - struct FuseAddAndStridedSlice : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -1152,8 +1063,8 @@ struct Convert2DUpscalingToResizeNearestNeighor // - tfl.gather_nd -> tfl.transpose -> tfl.gather_nd -> tfl.transpose // where ... // - all tfl.gather_nd op instances take [0, 0, 1, 1, ..., n-1, n-1] as - // the indices arugment, - // - first tranpose op takes perm [2, 1, 0, 3], and + // the indices argument, + // - first transpose op takes perm [2, 1, 0, 3], and // - second transpose op take perm [1, 2, 0, 3]. // // Note the current pattern matching logic only handles when width == height. @@ -1176,7 +1087,7 @@ struct Convert2DUpscalingToResizeNearestNeighor return failure(); } - // The pattern matching allows arbitary channel dimension but it handles + // The pattern matching allows arbitrary channel dimension but it handles // only when height = width. 
if (params_type.getShape().size() != 4 || indices_type.getShape().size() != 2) @@ -1219,7 +1130,7 @@ struct Convert2DUpscalingToResizeNearestNeighor ++i; } - // Check whether first tranpose's perm has [2, 1, 0, 3]. + // Check whether first transpose's perm has [2, 1, 0, 3]. DenseIntElementsAttr perm; if (!matchPattern(transpose_first.getPerm(), m_Constant(&perm))) return failure(); @@ -1229,7 +1140,7 @@ struct Convert2DUpscalingToResizeNearestNeighor } if (axes != SmallVector({2, 1, 0, 3})) return failure(); - // Check whether second tranpose's perm has [1, 2, 0, 3]. + // Check whether second transpose's perm has [1, 2, 0, 3]. if (!matchPattern(transpose_second.getPerm(), m_Constant(&perm))) return failure(); axes.clear(); @@ -1454,7 +1365,7 @@ struct FuseAddAndFullyConnected // FC(Mul(lhs, rhs), filter, bias) // .. with .. // FC(lhs, Mul(filter, rhs), bias) -// .. if rhs, filter, and bias are all constants. +// .. if rhs and filter are all constants. // The generated Mul will be constant folded to a single matrix. struct FuseMulAndFullyConnected : public OpRewritePattern { @@ -1483,6 +1394,28 @@ struct FuseMulAndFullyConnected return failure(); } + // Checks the constant requirements. + if (!matchPattern(mul_op.getRhs(), m_Constant())) { + return failure(); + } + + if (!matchPattern(fc_op.getFilter(), m_Constant())) { + // We must not apply this optimization if RHS is not a constant. + // + // In particular, this optimization must not break the weight-only + // quantized FullyConnected sequence: + // + // %filter_quant = "tfl.pseudo_qconst"() <{...}> + // : () -> tensor<... x !quant.uniform<...>> + // %filter_dequant = "tfl.dequantize"(%filter_quant) + // : (tensor<... x !quant.uniform<...>>) -> tensor<... x f32> + // %fc = "tfl.fully_connected"(%input, %filter_dequant, ...) + // : (tensor<... x f32>, tensor<... x f32>, ...) + // -> tensor<... x f32> + // + return failure(); + } + auto location = FusedLoc::get(mul_op.getContext(), {mul_op.getLoc(), fc_op.getLoc()}); @@ -2533,7 +2466,9 @@ struct EliminateQDQPairs : public OpRewritePattern { struct UndoBroadcastFullyConnectedBiasAddWithQDQs : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; - LogicalResult match(TFL::AddOp add_op) const override { + + LogicalResult matchAndRewrite(TFL::AddOp add_op, + PatternRewriter &rewriter) const override { if (!add_op->hasOneUse()) { return failure(); } @@ -2572,13 +2507,6 @@ struct UndoBroadcastFullyConnectedBiasAddWithQDQs return failure(); } - return success(); - } - - void rewrite(TFL::AddOp add_op, PatternRewriter &rewriter) const override { - auto dq_op = cast(add_op.getRhs().getDefiningOp()); - auto q_op = cast(dq_op.getInput().getDefiningOp()); - auto bias_op = cast(q_op.getInput().getDefiningOp()); auto new_bias = FlattenTo1D(bias_op.getValueAttr()); auto new_bias_type = new_bias.getType(); auto new_bias_op = rewriter.create( @@ -2603,6 +2531,7 @@ struct UndoBroadcastFullyConnectedBiasAddWithQDQs // Remove old bias rewriter.eraseOp(bias_op); + return success(); } }; @@ -2705,6 +2634,341 @@ struct EnableFullyConnectedKeepNumDimsBeforeReshape } }; +// This pattern push transposes through squeeze ops to facilitate further +// transpose and reshape fusions. For example, some JAX model could have +// subgraphs like Reshape-Transpose-Squeeze. With this pattern, the transpose +// can be pushed through the squeeze op, and fused with a subsequent reshape or +// removed entirely. The squeeze op could also be fused with the former reshape. 
+//
+// The pattern is designed to have lower benefit/priority than others,
+// while the push may still happen if the transpose could be fused with
+// downstream optimization phases or passes.
+struct PushTransposeThroughSqueeze : public RewritePattern {
+  explicit PushTransposeThroughSqueeze(MLIRContext *context)
+      : RewritePattern(TFL::SqueezeOp::getOperationName(), /*benefit=*/0,
+                       context) {}
+
+  LogicalResult matchAndRewrite(mlir::Operation *op,
+                                PatternRewriter &rewriter) const override {
+    TFL::SqueezeOp squeeze = cast<TFL::SqueezeOp>(op);
+    auto transpose = llvm::dyn_cast_or_null<TFL::TransposeOp>(
+        squeeze.getInput().getDefiningOp());
+    if (!transpose) {
+      return failure();
+    }
+
+    int32_t input_rank = transpose.getType().getShape().size();
+
+    llvm::SmallVector<int32_t> squeeze_dims;
+    if (squeeze->hasAttr("squeeze_dims")) {
+      for (const auto &squeeze_dim : squeeze.getSqueezeDimsAttr()) {
+        squeeze_dims.push_back(
+            mlir::dyn_cast<mlir::IntegerAttr>(squeeze_dim).getInt());
+      }
+    }
+    if (squeeze_dims.empty()) {
+      for (int dim = 0; dim < input_rank; ++dim) {
+        if (transpose.getType().getDimSize(dim) == 1) {
+          squeeze_dims.push_back(dim);
+        }
+      }
+    }
+
+    mlir::DenseIntElementsAttr perm_attr;
+    if (!matchPattern(transpose.getPerm(), m_Constant(&perm_attr))) {
+      return failure();
+    }
+    llvm::SmallVector<int32_t> perm;
+    for (const auto &dim : perm_attr.getValues<APInt>()) {
+      perm.push_back(dim.getSExtValue());
+    }
+
+    // Map squeeze dimensions to their positions after transpose.
+    llvm::sort(squeeze_dims);
+    llvm::SmallVector<int32_t> new_squeeze_dims;
+    for (int32_t dim : squeeze_dims) {
+      new_squeeze_dims.push_back(perm[dim]);
+    }
+    llvm::sort(new_squeeze_dims);
+
+    // Filter the original transpose permutation to keep only non-squeezed
+    // positions.
+    llvm::SmallVector<int32_t> filtered_perm_original_indices;
+    for (int i = 0; i < input_rank; ++i) {
+      if (!llvm::is_contained(squeeze_dims, i)) {
+        filtered_perm_original_indices.push_back(perm[i]);
+      }
+    }
+
+    // Map the remaining original dimension indices to new 0-based indices after
+    // squeeze.
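// Illustrative walkthrough of the index bookkeeping in this pattern (the
// shapes and the permutation are assumed for this note, not taken from the
// patch): with a 1x8x4 transpose input, perm = [1, 2, 0] (transpose output
// 8x4x1) and inferred squeeze_dims = [2], the squeezed output dim maps back
// to input position perm[2] = 0, so new_squeeze_dims = [0]. The surviving
// output dims carry perm values [1, 2]; renumbering the surviving input dims
// {1, 2} to {0, 1} yields new_perm = [0, 1]. The emitted ops are therefore
//   %s = tfl.squeeze(%x, squeeze_dims = [0])   : 1x8x4 -> 8x4
//   %t = tfl.transpose(%s, perm = [0, 1])      // identity, removable later
// matching the original squeeze(transpose(%x)) result of shape 8x4.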
+ llvm::SmallVector original_remaining_dims; + for (int i = 0; i < input_rank; ++i) { + if (!llvm::is_contained(new_squeeze_dims, i)) { + original_remaining_dims.push_back(i); + } + } + + llvm::SmallVector original_to_new_index_map(input_rank, -1); + for (int i = 0; i < original_remaining_dims.size(); ++i) { + original_to_new_index_map[original_remaining_dims[i]] = i; + } + + llvm::SmallVector new_perm; + for (const auto &original_dim : filtered_perm_original_indices) { + new_perm.push_back(original_to_new_index_map[original_dim]); + } + + llvm::SmallVector new_squeeze_shape; + for (int i = 0; i < input_rank; ++i) { + if (!llvm::is_contained(new_squeeze_dims, i)) { + new_squeeze_shape.push_back( + transpose.getInput().getType().getDimSize(i)); + } + } + auto new_squeeze = rewriter.create( + squeeze->getLoc(), + mlir::RankedTensorType::get(new_squeeze_shape, + squeeze.getType().getElementType()), + transpose.getInput(), rewriter.getI32ArrayAttr(new_squeeze_dims)); + + auto new_transpose = rewriter.create( + squeeze->getLoc(), squeeze.getType(), new_squeeze, + rewriter.create( + squeeze->getLoc(), GetI32ElementsAttr(new_perm, &rewriter))); + + rewriter.replaceOp(squeeze, new_transpose); + return success(); + } +}; + +// Helper function to check if a constant tensor attribute has the expected +// integer values +bool matchConstantIntPermutation(Value permValue, + ArrayRef expectedPerm) { + DenseElementsAttr permAttr; + if (!matchPattern(permValue, m_Constant(&permAttr))) { + return false; // Not a constant + } + if (!permAttr.getElementType().isInteger(32) && + !permAttr.getElementType().isInteger(64)) { + // TFLite perms are often i32, but accept i64 too + return false; + } + + auto values = permAttr.getValues(); + if (values.size() != expectedPerm.size()) { + return false; + } + for (size_t i = 0; i < expectedPerm.size(); ++i) { + if (values[i].getSExtValue() != expectedPerm[i]) { + return false; + } + } + return true; +} + +inline DenseIntElementsAttr GetI32ElementsAttr(ArrayRef values, + Builder *builder) { + RankedTensorType ty = mlir::RankedTensorType::get( + {static_cast(values.size())}, builder->getIntegerType(32)); + return DenseIntElementsAttr::get(ty, values); +} + +inline DenseIntElementsAttr GetI32ElementsAttr(ArrayRef values, + Builder *builder) { + llvm::SmallVector new_values; + for (auto el : values) { + new_values.push_back(static_cast(el)); + } + RankedTensorType ty = mlir::RankedTensorType::get( + {static_cast(values.size())}, builder->getIntegerType(32)); + return DenseIntElementsAttr::get(ty, new_values); +} + +// Reorders a Transpose-Reshape-Transpose sequence to +// Reshape-Transpose-Transpose to allow for further optimization. +// +// The pattern matches: +// Transpose(Reshape(Transpose(input, perm: [1, 0]))) +// +// and rewrites it to: +// Transpose(Transpose(Reshape(input))) +// +// This reordering allows for further optimization by potentially fusing the +// reshapes and transposes. 
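// A schematic instance of this reorder (tensor shapes are assumed for this
// note, and the constant perm/shape operands are written inline for brevity):
//
//   %t0 = tfl.transpose(%in, perm = [1, 0])   : tensor<4x6xf32> -> tensor<6x4xf32>
//   %r  = tfl.reshape(%t0, shape = [2, 3, 4]) : tensor<6x4xf32> -> tensor<2x3x4xf32>
//   %t1 = tfl.transpose(%r, %perm)            : tensor<2x3x4xf32> -> ...
//
// is rewritten by the pattern below into
//
//   %r  = tfl.reshape(%in, shape = [4, 2, 3]) : tensor<4x6xf32> -> tensor<4x2x3xf32>
//   %t0 = tfl.transpose(%r, perm = [1, 2, 0]) : tensor<4x2x3xf32> -> tensor<2x3x4xf32>
//   %t1 = tfl.transpose(%t0, %perm)           : tensor<2x3x4xf32> -> ...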
+struct ReorderTransposeReshapeTranspose + : public OpRewritePattern { + explicit ReorderTransposeReshapeTranspose(MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/0) {} + + LogicalResult matchAndRewrite(TFL::TransposeOp outer_tpose, + PatternRewriter &rewriter) const override { + auto reshape = outer_tpose.getInput().getDefiningOp(); + if (!reshape) return failure(); + + auto inner_tpose = reshape.getInput().getDefiningOp(); + if (!inner_tpose) return failure(); + + auto inner_tpose_shape = + mlir::dyn_cast_or_null(inner_tpose.getType()); + if (!inner_tpose_shape) return failure(); + + auto input = inner_tpose.getInput(); + + auto inner_perm = inner_tpose.getPerm(); + if (!matchConstantIntPermutation(inner_perm, {1, 0})) return failure(); + + int64_t perm0 = inner_tpose_shape.getDimSize(0); + + llvm::SmallVector reshape_shape; + { + DenseIntElementsAttr reshape_shape_attr; + if (!matchPattern(reshape.getShape(), m_Constant(&reshape_shape_attr))) { + return failure(); + } + + for (auto dim : reshape_shape_attr) { + reshape_shape.push_back(static_cast(dim.getSExtValue())); + } + } + + // Consume dimensions until we've equaled the size of the first dim in the + // permuted result of the inner tpose and record the dim. + int32_t dim = -1; + for (auto i = 0, running_total = 1; i < reshape_shape.size(); i++) { + running_total *= reshape_shape[i]; + if (perm0 == running_total) { + dim = i; + } + } + + if (dim == -1) return failure(); + + llvm::SmallVector new_reshape_shape(reshape_shape.size()); + llvm::SmallVector new_inner_perm(reshape_shape.size()); + + int index = 0; + for (auto i = dim + 1; i < reshape_shape.size(); i++) { + new_inner_perm[i] = index; + new_reshape_shape[index++] = reshape_shape[i]; + } + for (auto i = 0; i <= dim; i++) { + new_inner_perm[i] = index; + new_reshape_shape[index++] = reshape_shape[i]; + } + + auto reshape_type = + mlir::dyn_cast_or_null(reshape.getType()); + if (!reshape_type) return failure(); + + auto new_reshape_shape_const = rewriter.create( + reshape.getLoc(), GetI32ElementsAttr(new_reshape_shape, &rewriter)); + + auto new_inner_reshape = rewriter.create( + reshape.getLoc(), + RankedTensorType::get(new_reshape_shape, reshape_type.getElementType()), + input, new_reshape_shape_const.getResult()); + auto new_inner_tpose = rewriter.create( + inner_tpose.getLoc(), reshape_type, new_inner_reshape, + rewriter.create( + inner_tpose.getLoc(), + GetI32ElementsAttr(new_inner_perm, &rewriter))); + + rewriter.replaceOp(reshape, new_inner_tpose); + + return success(); + } +}; + +// Some models produce FullyConnected ops where the LHS is a const and the RHS +// is the activation. This breaks some downstream optimizations (notably input +// caching in XNNPack among other things). This rewrite pattern swaps the +// operands to match the expected order and recomputes a new output shape for +// the resuling op. +// +// This pattern only applies when: +// * input and filter operands are 2D +// * bias = none +// * keep_num_dims = false (implied if input and filter are 2D) +// Support for additional cases to broaden applicability can be added later. +// TODO(b/408313959): Add support for more cases. 
+// +// Note that transposes are added to maintain correctness: +// +// Original: Output[B, O] = FC(Input[B, I](Const), Filter[O, I](Var), Bias=None) +// ~= matmul(C, transpose(V)) +// +// Transformed: +// Intermediate[O, B] = FC(Filter[O, I](Var), Input[B, I](Const), None) +// ~= matmul(V, transpose(C)) +// FinalOutput[B, O] = Transpose(Intermediate[O, B], perm=[1, 0]) +struct FullyConnectedSwapOperandsWhenLHSIsConst + : public OpRewritePattern { + explicit FullyConnectedSwapOperandsWhenLHSIsConst(MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/0) {} + + LogicalResult matchAndRewrite(TFL::FullyConnectedOp fc, + PatternRewriter &rewriter) const override { + if (!mlir::isa(fc.getBias().getType())) return failure(); + + auto input = fc.getInput(); + auto filter = fc.getFilter(); + + if (!matchPattern(input, m_Constant()) || + matchPattern(filter, m_Constant())) + return failure(); + + auto input_type = mlir::dyn_cast(input.getType()); + auto filter_type = mlir::dyn_cast(filter.getType()); + auto output_type = + mlir::dyn_cast(fc.getResult(0).getType()); + + if (!input_type || !filter_type || !output_type) return failure(); + + if (input_type.getRank() != 2 || filter_type.getRank() != 2) + return failure(); + + // Dimensions: B=Batch, I=InputDepth, O=OutputDepth + // Input: [B, I], Filter: [O, I] + // We extract B from the input operand and O from the filter operand + int64_t B = input_type.getDimSize(0); + int64_t O = filter_type.getDimSize(0); + + Type element_type = output_type.getElementType(); + Location loc = fc.getLoc(); + + RankedTensorType intermediate_type = + RankedTensorType::get({O, B}, element_type); + + auto new_fc = rewriter.create( + loc, + /*resultTypes=*/intermediate_type, + /*input=*/filter, // Original Filter V[O, I] + /*filter=*/input, // Original Input C[B, I] + /*bias=*/fc.getBias(), + /*fused_activation_function=*/ + rewriter.getStringAttr(fc.getFusedActivationFunction()), + /*weights_format=*/fc.getWeightsFormatAttr(), + /*keep_num_dims=*/rewriter.getBoolAttr(false), + /*asymmetric_quantize_inputs=*/ + fc.getAsymmetricQuantizeInputsAttr() // Propagate quant attr + ); + + RankedTensorType final_shape_type = + RankedTensorType::get({B, O}, element_type); + + Value transposed_result = rewriter.create( + loc, final_shape_type, new_fc.getResult(0), + rewriter.create( + loc, GetI32ElementsAttr(ArrayRef({1, 0}), &rewriter))); + + rewriter.replaceOp(fc, transposed_result); + + return success(); + } +}; + // Adds canonicalization patterns to the list of patterns. 
void AddCanonicalizationPatterns(MLIRContext *context, RewritePatternSet *patterns) { @@ -2727,7 +2991,8 @@ void OptimizePass::runOnOperation() { FuseOutputReshape_BatchMatMulWithFlattenedContractingDims, FuseSqueezingLhsReshapeIntoFC_Output, FuseReshapesAroundBatchMatMulLHS, FuseReshapesAroundBatchMatMulLHS1, - FuseInputReshape_BatchMatMulWithFlattenedRhsDims>(ctx); + FuseInputReshape_BatchMatMulWithFlattenedRhsDims, + PushTransposeThroughSqueeze>(ctx); (void)applyPatternsGreedily(func, std::move(phase_0_patterns)); // Potentially the binary ops might be fused together, like hard_swish, thus @@ -2764,8 +3029,9 @@ void OptimizePass::runOnOperation() { OptimizeTopK, FuseAddAndStridedSlice, FuseReshapeAndTransposeAroundBatchMatmul, FuseTransposeReshapeIntoBatchMatmul, MoveReshapeAfterFullyConnected, - EnableFullyConnectedKeepNumDimsBeforeReshape, ConvertTFLBroadcastToMulOp>( - ctx); + EnableFullyConnectedKeepNumDimsBeforeReshape, + ReorderTransposeReshapeTranspose, + FullyConnectedSwapOperandsWhenLHSIsConst>(ctx); if (!GetOptions().disable_fuse_mul_and_fc) { phase_2_patterns.add(ctx); } diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 60cd31622719..99a1a01d7f96 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -27,21 +27,21 @@ include "mlir/IR/CommonAttrConstraints.td" // Checks if the param passed is a F32 ElementsAttr. def F32ElementsAttr : ElementsAttrBase< - CPred<"$_self.isa() && $_self.cast().getShapedType().getElementType().isF32()">, + CPred<"llvm::isa($_self) && llvm::cast($_self).getShapedType().getElementType().isF32()">, "32 bit float constant tensor">; // Checks if the param passed is a float ElementsAttr. def FloatElementsAttr : ElementsAttrBase< - CPred<"$_self.isa() && $_self.cast().getShapedType().getElementType().isa()">, + CPred<"llvm::isa($_self) && llvm::isa(llvm::cast($_self).getShapedType().getElementType())">, "float constant tensor">; def ExtractSingleElementAsFloat : NativeCodeCall< - "ExtractSingleElementAsFloat($_self.cast())">; + "ExtractSingleElementAsFloat(llvm::cast($_self))">; // Checks if the value has rank 'n'. 
class HasRank : Constraint< - CPred<"$0.getType().cast().hasRank() && " - "$0.getType().cast().getRank() == " # n>>; + CPred<"llvm::cast($0.getType()).hasRank() && " + "llvm::cast($0.getType()).getRank() == " # n>>; class FloatValueEquals : Constraint>; @@ -57,9 +57,9 @@ def HasOneUse : Constraint>; def IsPermutationNCHW : Constraint>; def IsBiasShape : Constraint< - CPred<"$0.getType().cast().getRank() == 4 && " - "$0.getType().cast().getShape()[2] == 1 && " - "$0.getType().cast().getShape()[3] == 1">, + CPred<"llvm::cast($0.getType()).getRank() == 4 && " + "llvm::cast($0.getType()).getShape()[2] == 1 && " + "llvm::cast($0.getType()).getShape()[3] == 1">, "has shape consistent with a bias">; def ReshapeNCHWBiasToNHWC : NativeCodeCall<"ReshapeNCHWBiasToNHWC($0, $1)">; @@ -114,7 +114,7 @@ foreach actFnPair = [[TFL_ReluOp, TFL_AF_Relu], } def GetBiasMultiplier: - NativeCodeCall<"GetBiasMultiplier($_builder, $0, $1.cast())">; + NativeCodeCall<"GetBiasMultiplier($_builder, $0, llvm::cast($1))">; class CanFuseConvOrDepthwiseConv : Constraint< CPred<"TFL::CanFuseConvOrDepthwiseConv($0, $1, " # is_depthwise # ")">>; @@ -155,7 +155,7 @@ multiclass FuseBinaryOpToPrecedingAffine { (Arith_ConstantOp FloatElementsAttr:$value), $act_fn), (TFL_TransposeConvOp $output_shape, $weights, $input, (binaryOp (Arith_ConstantOp $bias), - (Arith_ConstantOp $value), TFL_AF_None), + (Arith_ConstantOp (FlattenTo1D $value)), TFL_AF_None), $padding, $stride_h, $stride_w, $act_fn), [(CanFuseConvOrDepthwiseConv<"false"> $weights, $value), (HasOneUse $output)]>; @@ -166,7 +166,7 @@ multiclass FuseBinaryOpToPrecedingAffine { $stride_h, $stride_w, TFL_AF_None), (Arith_ConstantOp FloatElementsAttr:$value), $act_fn), (TFL_TransposeConvOp $output_shape, $weights, $input, - (TFL_MulOp (Arith_ConstantOp $value), + (TFL_MulOp (Arith_ConstantOp (FlattenTo1D $value)), (GetBiasMultiplier $root, $value), TFL_AF_None ), @@ -372,22 +372,22 @@ def MatchHardSwishPattern6 : Pat< // Constraint that the attribute value is less than 'n' class ConstDoubleValueLessThan : Constraint< - CPred<"$0.isa() && " - "$0.cast().getNumElements() == 1 && " - "std::abs(*$0.cast().getValues().begin()) < " + CPred<"llvm::isa($0) && " + "llvm::cast($0).getNumElements() == 1 && " + "std::abs(*llvm::cast($0).getValues().begin()) < " # n>>; // Constraint that the attribute value is negative infinity or negative largest. // We use both -inf & flt_min due to the forward compatibility. def ConstAPFloatNegLargestOrNegInfinity : Constraint() && " - "$0.cast().getNumElements() == 1 && " - "(($0.cast().getValues()[0].isLargest() && " - "$0.cast().getValues()[0].isNegative()) || " - "$0.cast().getValues()[0].isNegInfinity())">>; + "llvm::isa($0) && " + "llvm::cast($0).getNumElements() == 1 && " + "((llvm::cast($0).getValues()[0].isLargest() && " + "llvm::cast($0).getValues()[0].isNegative()) || " + "llvm::cast($0).getValues()[0].isNegInfinity())">>; def L2NormValidReduceIndex : Constraint())">>; + "L2NormalizeReduceAxis($0, llvm::cast($1))">>; // Currently L2Normalization doesn't support activation function // in TFLite. 
@@ -456,9 +456,9 @@ def IsReducedTailOfShape : Constraint>; def Flatten : NativeCodeCall< - "$0.cast()" - ".reshape(RankedTensorType::get({$0.getType().cast().getNumElements()}, " - "$0.getType().cast().getElementType()))">; + "llvm::cast($0)" + ".reshape(RankedTensorType::get({llvm::cast($0.getType()).getNumElements()}, " + "llvm::cast($0.getType()).getElementType()))">; def IsLastDimEqualToNumElements : Constraint>; @@ -725,20 +725,20 @@ foreach ValueOp = [TFL_CeilOp, TFL_ExpOp, TFL_FloorOp, TFL_NegOp, // Returns truncated shape of a ranked-tensor. // Prefix-Truncated, here, means eliminating any contiguous 1s' in the lower // dimentions of the tensor -def GetPrefixTruncatedShape: NativeCodeCall<"GetShape($0, true)">; +def GetPrefixTruncatedShape: NativeCodeCall<"GetShapeAttr($0, true)">; // Returns True if the operand type is RankedTensorType and valid. def HasValidRankedTensor : Constraint() && " - "$0.getType().cast().getNumDynamicDims() <= 1">>; + "llvm::isa($0.getType()) && " + "llvm::cast($0.getType()).getNumDynamicDims() <= 1">>; // Check if the truncated shape of the lhs is equal to the shape of rhs def IsPrefixTruncatedShapeEqualTo : Constraint>; + "GetShapeAttr($0, true) == GetShapeAttr($1)">>; def ConvertSqueezeToReshape : Pat< (TFL_SqueezeOp:$squeeze_op $input, $squeeze_dims), - (TFL_ReshapeOp $input, (Arith_ConstantOp (GetShape $squeeze_op))), + (TFL_ReshapeOp $input, (Arith_ConstantOp (GetShapeAttr $squeeze_op))), [(HasValidRankedTensor $squeeze_op)]>; // Pattern to perform the following optimization @@ -793,7 +793,7 @@ def UndoBroadcastConvBiasAdd : Pat< // Pattern to convert a trivial transpose op to a reshape op. def ConvertTrivialTransposeOpToReshapeOp : Pat< (TFL_TransposeOp:$transpose_op $input, (Arith_ConstantOp:$permutation $p1)), - (TFL_ReshapeOp $input, (Arith_ConstantOp (GetShape $transpose_op))), + (TFL_ReshapeOp $input, (Arith_ConstantOp (GetShapeAttr $transpose_op))), [(IsTransposeTrivial $input, $permutation), (AnyStaticShapeTensor $input), (AnyStaticShapeTensor $transpose_op)]>; @@ -810,7 +810,7 @@ def FoldDoubleTranspose : Pat< // Convert expand_dims to reshape if possible. def ConvertExpandDimsToReshape : Pat< (TFL_ExpandDimsOp:$expand_dims_op $input, $dim), - (TFL_ReshapeOp $input, (Arith_ConstantOp (GetShape $expand_dims_op))), + (TFL_ReshapeOp $input, (Arith_ConstantOp (GetShapeAttr $expand_dims_op))), [(AnyStaticShapeTensor $expand_dims_op)]>; // Here, the element type can be any integer or float type. @@ -900,8 +900,8 @@ def RemoveShapeOnlyCast : Pat<(TFL_CastOp:$output $input), // Checks if the operand0's rank is one less than operand1's rank. 
def PReluAlphaRankCheck : Constraint< - CPred<"$0.getType().cast().getRank() == " - "$1.getType().cast().getRank() - 1">>; + CPred<"llvm::cast($0.getType()).getRank() == " + "llvm::cast($1.getType()).getRank() - 1">>; // PReLU pattern from Keras: // f(x) = Relu(x) + (-alpha * Relu(-x)) @@ -979,7 +979,7 @@ def OptimizePow2ToRsqrt : Pat< def CanOptimizeIdentityGatherNdOrScatterNdOp : Constraint(), $2.getType())">>; + "$0, llvm::cast($1), $2.getType())">>; def OptimizeIdentityGatherNdOp : Pat< (TFL_GatherNdOp:$output $params, (Arith_ConstantOp I32ElementsAttr: $indices)), @@ -1013,9 +1013,9 @@ def IsSame : Constraint>; def HasTwoUse : Constraint>; def AxesIsLastDimension : Constraint().getNumElements() == 1 && " - "($0.cast().getValues()[0] == " - "$1.getType().cast().getRank() - 1 || $0.cast().getValues()[0] == -1)">>; + "llvm::cast($0).getNumElements() == 1 && " + "(llvm::cast($0).getValues()[0] == " + "llvm::cast($1.getType()).getRank() - 1 || llvm::cast($0).getValues()[0] == -1)">>; // Convert exp(x)/sum(exp(x)) into softmax. def OptimizeToSoftmax : Pat< @@ -1070,10 +1070,10 @@ def FoldNormalizationIntoSoftmaxJaxWithAxisMinus1 : Pat< def HaveSameType : Constraint>; class AllElementsAreF32 : Constraint() && " - "$0.cast().getType().cast().getElementType().isF32() && " - "std::all_of($0.cast().getValues().begin(), " - "$0.cast().getValues().end(), " + "(llvm::isa($0) && " + "llvm::cast(llvm::cast($0).getType()).getElementType().isF32() && " + "std::all_of(llvm::cast($0).getValues().begin(), " + "llvm::cast($0).getValues().end(), " "[](float v){ return v == " #val# ";}))">>; // Optimize X*1 to X @@ -1086,10 +1086,10 @@ def OptimizeMul1ToIdentity : Pat< (AllElementsAreF32<"1.0f"> $constant)]>; class AllElementsAreBool : Constraint() && " - "$0.cast().getType().cast().getElementType().isInteger(1) && " - "std::all_of($0.cast().getValues().begin(), " - "$0.cast().getValues().end(), " + "(llvm::isa($0) && " + "llvm::cast(llvm::cast($0).getType()).getElementType().isInteger(1) && " + "std::all_of(llvm::cast($0).getValues().begin(), " + "llvm::cast($0).getValues().end(), " "[](bool v){ return v == " #val# ";}))">>; // Remove select operators when the result is known in advance. @@ -1114,6 +1114,24 @@ foreach SelectOp = [TFL_SelectOp, TFL_SelectV2Op] in { def Optimize#SelectOp#Not : Pat< (SelectOp (TFL_LogicalNotOp $condition), $input1, $input2), (SelectOp $condition, $input2, $input1)>; + // select(C, true_tensor, false_tensor) -> C + def Optimize#SelectOp#IsNoop : Pat< + (SelectOp:$result $condition, + (Arith_ConstantOp $input1), + (Arith_ConstantOp $input2)), + (replaceWithValue $condition), + [(HaveSameType $condition, $result), + (AllElementsAreBool<"true"> $input1), + (AllElementsAreBool<"false"> $input2)]>; + // select(C, false_tensor, true_tensor) -> logical_not(C) + def Optimize#SelectOp#IsNegate : Pat< + (SelectOp:$result $condition, + (Arith_ConstantOp $input1), + (Arith_ConstantOp $input2)), + (TFL_LogicalNotOp $condition), + [(HaveSameType $condition, $result), + (AllElementsAreBool<"false"> $input1), + (AllElementsAreBool<"true"> $input2)]>; } def EliminateLogicalAndTrue : Pat< @@ -1207,11 +1225,11 @@ def IsLastDimensionEqualOne : Constraint>; // As above but if shape is not static and rank 2 with last dim 1. 
def IsLastDimensionEqualOneOrDynamicBatchDimRank2 : Constraint< CPred<"IsLastDimensionEqualOne($0) || " - "(!$0.getType().cast().hasStaticShape() && " - " $0.getType().cast().hasRank() && " - " $0.getType().cast().getRank() == 2 && " - " !$0.getType().cast().getShape().empty() && " - " $0.getType().cast().getShape()[1] == 1)">>; + "(!llvm::cast($0.getType()).hasStaticShape() && " + " llvm::cast($0.getType()).hasRank() && " + " llvm::cast($0.getType()).getRank() == 2 && " + " !llvm::cast($0.getType()).getShape().empty() && " + " llvm::cast($0.getType()).getShape()[1] == 1)">>; // Replace // Equal(X, indices) @@ -1232,10 +1250,10 @@ def ReshapeEqualOpToOneHotOp : Pat< (IsOneHotIndexAttribute $series)]>; def F32ElementsVal : Constraint().getElementType().isF32()">, + "llvm::cast($0.getType()).getElementType().isF32()">, "32 bit float tensor">; def I32ElementsVal : Constraint().getElementType().isInteger(32)">, + "llvm::cast($0.getType()).getElementType().isInteger(32)">, "32 bit integer tensor">; def ConvertSingleElementAttrToFloatAttr : @@ -1306,7 +1324,7 @@ def ReplaceOneHotFullyConnectedWithLookup : Pat< (Arith_ConstantOp ConstantAttr, "{1,0}">)), (returnType (GetEmbeddingLookupShape $indices, $filter)) ), - (Arith_ConstantOp (GetShape (GetIthValue<0> $outputs)))), + (Arith_ConstantOp (GetShapeAttr (GetIthValue<0> $outputs)))), [(I32ElementsVal $indices), // lookup is not implemented for i64 (IsNoneType $bias)]>; // Maybe folded into the lookup matrix later @@ -1379,6 +1397,67 @@ def MatchGeluApproximate : Pat< (HasOneUse $pow_out), ]>; +// Alternate pattern for GeluApproximate to match mul(x, mul(x, x)). +// 0.5 * x * ( 1 + tanh( sqrt_2dPi * ( x + 0.044715 * mul(x, mul(x, x)) ) ) ) +def MatchGeluApproximate_Mul1 : Pat< + (TFL_MulOp + (TFL_MulOp:$mul_out $arg0, (Arith_ConstantOp F32ElementsAttr:$Cst_1_2), TFL_AF_None), + (TFL_AddOp:$add_out + (TFL_TanhOp:$tanh_out + (TFL_MulOp:$mul_out1 + (TFL_AddOp:$add_out1 $arg0, + (TFL_MulOp:$mul_out2 + (TFL_MulOp:$pow_out $arg0, + (TFL_MulOp:$sqr_out $arg0, $arg0, TFL_AF_None), TFL_AF_None), + (Arith_ConstantOp F32ElementsAttr:$Coeff), TFL_AF_None), TFL_AF_None), + (Arith_ConstantOp F32ElementsAttr:$Cst_sqrt_2dPi), TFL_AF_None)), + (Arith_ConstantOp F32ElementsAttr:$Cst_1), TFL_AF_None), TFL_AF_None), + (TFL_GeluOp $arg0, ConstBoolAttrTrue), + [(FloatValueEquals<"0.5"> $Cst_1_2), + (FloatValueEquals<"1"> $Cst_1), + (FloatValueEquals<"0.797884583"> $Cst_sqrt_2dPi), + (FloatValueEquals<"0.044715"> $Coeff), + (HasOneUse $mul_out), + (HasOneUse $add_out), + (HasOneUse $tanh_out), + (HasOneUse $mul_out1), + (HasOneUse $add_out1), + (HasOneUse $mul_out2), + (HasOneUse $pow_out), + (HasOneUse $sqr_out), + ]>; + +// Alternate pattern for GeluApproximate to match mul(mul(x, x), x). 
+// 0.5 * x * ( 1 + tanh( sqrt_2dPi * ( x + 0.044715 * mul(mul(x, x), x) ) ) ) +def MatchGeluApproximate_Mul2 : Pat< + (TFL_MulOp + (TFL_MulOp:$mul_out $arg0, (Arith_ConstantOp F32ElementsAttr:$Cst_1_2), TFL_AF_None), + (TFL_AddOp:$add_out + (TFL_TanhOp:$tanh_out + (TFL_MulOp:$mul_out1 + (TFL_AddOp:$add_out1 $arg0, + (TFL_MulOp:$mul_out2 + (TFL_MulOp:$pow_out + (TFL_MulOp:$sqr_out $arg0, $arg0, TFL_AF_None), + $arg0, TFL_AF_None), + (Arith_ConstantOp F32ElementsAttr:$Coeff), TFL_AF_None), TFL_AF_None), + (Arith_ConstantOp F32ElementsAttr:$Cst_sqrt_2dPi), TFL_AF_None)), + (Arith_ConstantOp F32ElementsAttr:$Cst_1), TFL_AF_None), TFL_AF_None), + (TFL_GeluOp $arg0, ConstBoolAttrTrue), + [(FloatValueEquals<"0.5"> $Cst_1_2), + (FloatValueEquals<"1"> $Cst_1), + (FloatValueEquals<"0.797884583"> $Cst_sqrt_2dPi), + (FloatValueEquals<"0.044715"> $Coeff), + (HasOneUse $mul_out), + (HasOneUse $add_out), + (HasOneUse $tanh_out), + (HasOneUse $mul_out1), + (HasOneUse $add_out1), + (HasOneUse $mul_out2), + (HasOneUse $pow_out), + (HasOneUse $sqr_out), + ]>; + // Alternate pattern for GeluApproximate (see different order for mul), replaces // x * ( 0.5 * ( 1 + tanh( sqrt_2dPi * ( x + 0.044715 * pow( x, 3 ) ) ) ) ) def MatchGeluApproximate1 : Pat< @@ -1408,6 +1487,67 @@ def MatchGeluApproximate1 : Pat< (HasOneUse $pow_out), ]>; +// Alternate pattern for GeluApproximate1 to match mul(x, mul(x, x)). +// x * ( 0.5 * ( 1 + tanh( sqrt_2dPi * ( x + 0.044715 * mul(x, mul(x, x)) ) ) ) ) +def MatchGeluApproximate1_Mul1 : Pat< + (TFL_MulOp $arg0, + (TFL_MulOp:$mul_out + (TFL_AddOp:$add_out + (TFL_TanhOp:$tanh_out + (TFL_MulOp:$mul_out1 + (TFL_AddOp:$add_out1 $arg0, + (TFL_MulOp:$mul_out2 + (TFL_MulOp:$pow_out $arg0, + (TFL_MulOp:$sqr_out $arg0, $arg0, TFL_AF_None), TFL_AF_None), + (Arith_ConstantOp F32ElementsAttr:$Coeff), TFL_AF_None), TFL_AF_None), + (Arith_ConstantOp F32ElementsAttr:$Cst_sqrt_2dPi), TFL_AF_None)), + (Arith_ConstantOp F32ElementsAttr:$Cst_1), TFL_AF_None), (Arith_ConstantOp F32ElementsAttr:$Cst_1_2), TFL_AF_None), TFL_AF_None), + (TFL_GeluOp $arg0, ConstBoolAttrTrue), + [(FloatValueEquals<"0.5"> $Cst_1_2), + (FloatValueEquals<"1"> $Cst_1), + (FloatValueEquals<"0.797884583"> $Cst_sqrt_2dPi), + (FloatValueEquals<"0.044715"> $Coeff), + (HasOneUse $mul_out), + (HasOneUse $add_out), + (HasOneUse $tanh_out), + (HasOneUse $mul_out1), + (HasOneUse $add_out1), + (HasOneUse $mul_out2), + (HasOneUse $pow_out), + (HasOneUse $sqr_out), + ]>; + +// Alternate pattern for GeluApproximate1 to match mul(mul(x, x), x). 
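// (Across the *_Mul1 / *_Mul2 variants above and below, the only difference
// is how the x^3 term was lowered; schematically, with activation attributes
// omitted and SSA names assumed for illustration:
//   %sq   = tfl.mul %x, %x
//   %cube = tfl.mul %x, %sq    // matched by the *_Mul1 patterns
//   %cube = tfl.mul %sq, %x    // matched by the *_Mul2 patterns.)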
+// x * ( 0.5 * ( 1 + tanh( sqrt_2dPi * ( x + 0.044715 * mul(mul(x, x), x) ) ) ) ) +def MatchGeluApproximate1_Mul2 : Pat< + (TFL_MulOp $arg0, + (TFL_MulOp:$mul_out + (TFL_AddOp:$add_out + (TFL_TanhOp:$tanh_out + (TFL_MulOp:$mul_out1 + (TFL_AddOp:$add_out1 $arg0, + (TFL_MulOp:$mul_out2 + (TFL_MulOp:$pow_out + (TFL_MulOp:$sqr_out $arg0, $arg0, TFL_AF_None), + $arg0, TFL_AF_None), + (Arith_ConstantOp F32ElementsAttr:$Coeff), TFL_AF_None), TFL_AF_None), + (Arith_ConstantOp F32ElementsAttr:$Cst_sqrt_2dPi), TFL_AF_None)), + (Arith_ConstantOp F32ElementsAttr:$Cst_1), TFL_AF_None), (Arith_ConstantOp F32ElementsAttr:$Cst_1_2), TFL_AF_None), TFL_AF_None), + (TFL_GeluOp $arg0, ConstBoolAttrTrue), + [(FloatValueEquals<"0.5"> $Cst_1_2), + (FloatValueEquals<"1"> $Cst_1), + (FloatValueEquals<"0.797884583"> $Cst_sqrt_2dPi), + (FloatValueEquals<"0.044715"> $Coeff), + (HasOneUse $mul_out), + (HasOneUse $add_out), + (HasOneUse $tanh_out), + (HasOneUse $mul_out1), + (HasOneUse $add_out1), + (HasOneUse $mul_out2), + (HasOneUse $pow_out), + (HasOneUse $sqr_out), + ]>; + // For Gelu, replaces // 0.5 * x * ( 1 + erf( x * sqrt_1_2 ) ) def MatchGelu : Pat< @@ -1524,7 +1664,7 @@ def isF32Splat : Constraint< CPred<"IsF32Splat($0)">>; def ExtractF32AtIndex0: NativeCodeCall< - "$_builder.getF32FloatAttr($_self.cast().getValues()[0])">; + "$_builder.getF32FloatAttr(llvm::cast($_self).getValues()[0])">; def FuseLeakyReluConst : Pat< (TFL_SelectOp @@ -1559,16 +1699,16 @@ class ContractingDimsProductEqual : Constraint : Constraint().getShape()" + "(llvm::dyn_cast($0.getType()).getShape()" ".drop_back("#skip_last#").drop_front("#skip_first#") ==" - "$1.getType().dyn_cast().getShape()" + "llvm::dyn_cast($1.getType()).getShape()" ".drop_back("#skip_last#").drop_front("#skip_first#"))">>; // Returns true if the broadcast dimension of a tensor is [1] // here- broadcast dimension is first prefix dimension // excluding the last two dimensions def IsBroadcastDimEqualToOne : Constraint().getShape()[0] == 1">>; + "llvm::dyn_cast($0.getType()).getShape()[0] == 1">>; // Pattern to fuse/fold the reshape ops around TFL_BatchMatMulOp // This pattern is applied when the rank of rhs is 2 @@ -1711,6 +1851,7 @@ def FuseTransposeIntoBatchMatMulRHS: Pat< $input, (CreateNoneValue $lhs), TFL_AF_None, TFL_FCWO_Default, ConstBoolAttrTrue, $asymmetric_quantize_inputs), [(HasRank<2> $input), + (AnyStaticShapeTensor $input), (AreLastTwoDimsTransposed $perm_value), (IsBoolAttrEqual<"false"> $adj_x), (IsBoolAttrEqual<"false"> $adj_y)]>; @@ -1812,25 +1953,25 @@ def FuseSliceAndPack4D : Pat<( // Given a value, checks if dim `d` is static. 
class HasStaticDim : Constraint().isDynamicDim(" # d # ")">>; + "!llvm::cast($0.getType()).isDynamicDim(" # d # ")">>; class IsBalancedPaddingArray : Constraint())">>; + "llvm::cast($0))">>; // Given in_shape, out_shape, stride checks ceil(in_shape[d] / stride) == out_shape[d] def IsSameStridedShape2D : Constraint()," - "$1.getType().cast().getShape())">>; + "llvm::cast($1.getType()).getShape())">>; def IsSameStridedShapeDepthwise : Constraint()," - "$1.getType().cast().getShape())">>; + "llvm::cast($1.getType()).getShape())">>; def IsSameStridedShape3D : Constraint()," - "$1.getType().cast().getShape())">>; + "llvm::cast($1.getType()).getShape())">>; def IsValidPadding : Constraint>; @@ -1950,3 +2091,127 @@ def RealDivWithF32ConstDivisor : Pat< (GetScalarOfType<1> (Arith_ConstantOp $value))), (Arith_ConstantOp $value), TFL_AF_None), $activation)>; + +// Replace casting a boolean tensor to a numeric type, followed by comparing +// with zero. Note it doesn't matter what type we're casting to. HasSameType +// enforces both the input being boolean (as result always is), and prevents +// broadcasts. + +// 0 == Cast(bool_tensor) -> logical_not(bool_tensor) +def ZeroEqualCast : Pat< + (TFL_EqualOp:$result (Arith_ConstantOp $zero), (TFL_CastOp $input)), + (TFL_LogicalNotOp $input), + [(IsConstantValueOf<0> $zero), (HasSameType $input, $result)]>; + +// Cast(bool_tensor) == 0 -> logical_not(bool_tensor) +def CastEqualZero : Pat< + (TFL_EqualOp:$result (TFL_CastOp $input), (Arith_ConstantOp $zero)), + (TFL_LogicalNotOp $input), + [(IsConstantValueOf<0> $zero), (HasSameType $input, $result)]>; + +// 0 <= Cast(bool_tensor) -> constant true +// Using zeros_like to make sure shapes match. +def ZeroLessEqualCast : Pat< + (TFL_LessEqualOp:$result (Arith_ConstantOp $zero), (TFL_CastOp $input)), + (TFL_LogicalNotOp (TFL_ZerosLikeOp $input)), + [(IsConstantValueOf<0> $zero), (HasSameType $input, $result)]>; + +// Cast(bool_tensor) <= 0 -> logical_not(bool_tensor) +def CastLessEqualZero : Pat< + (TFL_LessEqualOp:$result (TFL_CastOp $input), (Arith_ConstantOp $zero)), + (TFL_LogicalNotOp $input), + [(IsConstantValueOf<0> $zero), (HasSameType $input, $result)]>; + +// 0 >= Cast(bool_tensor) -> logical_not(bool_tensor) +def ZeroGreaterEqualCast : Pat< + (TFL_GreaterEqualOp:$result (Arith_ConstantOp $zero), (TFL_CastOp $input)), + (TFL_LogicalNotOp $input), + [(IsConstantValueOf<0> $zero), (HasSameType $input, $result)]>; + +// Cast(bool_tensor) >= 0 -> constant true +// Using zeros_like to make sure shapes match. +def CastGreaterEqualZero : Pat< + (TFL_GreaterEqualOp:$result (TFL_CastOp $input), (Arith_ConstantOp $zero)), + (TFL_LogicalNotOp (TFL_ZerosLikeOp $input)), + [(IsConstantValueOf<0> $zero), (HasSameType $input, $result)]>; + +// 0 != Cast(bool_tensor) -> bool_tensor +def ZeroNotEqualCast : Pat< + (TFL_NotEqualOp:$result (Arith_ConstantOp $zero), (TFL_CastOp $input)), + (replaceWithValue $input), + [(IsConstantValueOf<0> $zero), (HasSameType $input, $result)]>; + +// Cast(bool_tensor) != 0 -> bool_tensor +def CastNotEqualZero : Pat< + (TFL_NotEqualOp:$result (TFL_CastOp $input), (Arith_ConstantOp $zero)), + (replaceWithValue $input), + [(IsConstantValueOf<0> $zero), (HasSameType $input, $result)]>; + +// 0 > Cast(bool_tensor) -> constant false +// Using zeros_like to make sure shapes match. 
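// A concrete instance of this family of rewrites (element type and shapes are
// assumed for illustration):
//   %c = tfl.cast(%b)             : tensor<4xi1> -> tensor<4xi32>
//   %r = tfl.not_equal(%c, %zero) : ...          -> tensor<4xi1>
// simply folds to %b, while the always-true / always-false comparisons are
// materialized through zeros_like (optionally wrapped in logical_not) so the
// replacement keeps the original result shape.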
+def ZeroGreaterCast : Pat< + (TFL_GreaterOp:$result (Arith_ConstantOp $zero), (TFL_CastOp $input)), + (TFL_ZerosLikeOp $input), + [(IsConstantValueOf<0> $zero), (HasSameType $input, $result)]>; + +// Cast(bool_tensor) > 0 -> bool_tensor +def CastGreaterZero : Pat< + (TFL_GreaterOp:$result (TFL_CastOp $input), (Arith_ConstantOp $zero)), + (replaceWithValue $input), + [(IsConstantValueOf<0> $zero), (HasSameType $input, $result)]>; + +// 0 < Cast(bool_tensor) -> bool_tensor +def ZeroLessCast : Pat< + (TFL_LessOp:$result (Arith_ConstantOp $zero), (TFL_CastOp $input)), + (replaceWithValue $input), + [(IsConstantValueOf<0> $zero), (HasSameType $input, $result)]>; + +// Cast(bool_tensor) < 0 -> constant false +// Using zeros_like to make sure shapes match. +def CastLessZero : Pat< + (TFL_LessOp:$result (TFL_CastOp $input), (Arith_ConstantOp $zero)), + (TFL_ZerosLikeOp $input), + [(IsConstantValueOf<0> $zero), (HasSameType $input, $result)]>; + +// x + (y - y) -> x +// This pattern can emerge through some usages of gradient stop. Note, for all +// activation functions fn(0) = 0, so it can be anything in the subtraction. +def AddComputedZeroRHS : Pat< + (TFL_AddOp:$output + $input, + (TFL_SubOp $input2, $input2, $activation), + TFL_AF_None), + (replaceWithValue $input), + [(HasSameType $input, $output)]>; +// (y - y) + x -> x +def AddComputedZeroLHS : Pat< + (TFL_AddOp:$output + (TFL_SubOp $input2, $input2, $activation), + $input, + TFL_AF_None), + (replaceWithValue $input), + [(HasSameType $input, $output)]>; + +// Replace matmul where inputs & weights have a last dimension of 1 with an +// elementwise multiplication that broadcasts, i.e. replace: +// [a, b, 1] x [n, 1] => [a, b, n] +// with: +// [a, b, 1] * [n] => [a, b, n] +def DegenerateFCtoMul : Pat< + (TFL_FullyConnectedOp + $input, + (Arith_ConstantOp:$filter $filterVal), + $bias, + $fused_activation_function, + TFL_FCWO_Default, + ConstBoolAttrTrue, + $asymmetric_quantize_inputs), + (TFL_MulOp + $input, + (Arith_ConstantOp (FlattenTo1D $filterVal)), + $fused_activation_function), + [(HasRankAtMost<4> $input), + (HasRank<2> $filter), + (IsLastDimensionEqualOne $input), + (SameElementType $input, $filter), + (IsNoneType $bias)]>; \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/transforms/pass_options_setter.h b/tensorflow/compiler/mlir/lite/transforms/pass_options_setter.h index 534b1402dd4c..29906014fce2 100644 --- a/tensorflow/compiler/mlir/lite/transforms/pass_options_setter.h +++ b/tensorflow/compiler/mlir/lite/transforms/pass_options_setter.h @@ -22,6 +22,7 @@ namespace TFL { class OptimizePassOptions; class VariableFreezingPipelineOptions; class EmptyPassOptions; +class OptimizeBroadcastLikePassOptions; // Interface for setting options for TFLite Converter Pass/Pipeline Options. class PassOptionsSetter { @@ -30,6 +31,7 @@ class PassOptionsSetter { virtual void SetOptions(OptimizePassOptions& options) const = 0; virtual void SetOptions(VariableFreezingPipelineOptions& options) const = 0; virtual void SetOptions(EmptyPassOptions& options) const = 0; + virtual void SetOptions(OptimizeBroadcastLikePassOptions& options) const = 0; }; } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.h b/tensorflow/compiler/mlir/lite/transforms/passes.h index 4d8ecccaa5f3..c6419e387b1b 100644 --- a/tensorflow/compiler/mlir/lite/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/transforms/passes.h @@ -22,9 +22,12 @@ limitations under the License. 
#include "absl/container/flat_hash_set.h" #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/lite/transforms/canonicalize_boundary_value_pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/cleanup_optimization_barrier_pass.h" #include "tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.h" #include "tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.h" +#include "tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass_options.h" #include "tensorflow/compiler/mlir/lite/transforms/optimize_pass.h" #include "tensorflow/compiler/mlir/lite/transforms/pass_registry_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise_pass.h" @@ -34,7 +37,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/transforms/tflite_passes/split_merged_operands_pass.h" #include "tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.h" #include "tensorflow/compiler/mlir/lite/transforms/unfreeze_global_constants.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" namespace mlir { namespace quant { @@ -110,7 +112,7 @@ std::unique_ptr> CreateLowerStaticTensorListPass(); // Use quant_specs.ops_blocklist and quant_specs.nodes_blocklist if possible // as they are now structure variables of QuantizationSpecs. std::unique_ptr> CreateQuantizePass( - const quant::QuantizationSpecs& quant_specs, + const QuantizationSpecs& quant_specs, const absl::flat_hash_set& ops_blocklist = {}, const absl::flat_hash_set& nodes_blocklist = {}); @@ -128,15 +130,14 @@ std::unique_ptr> CreateQuantizePass( // Creates an instance of the TensorFlow Lite dialect PrepareQuantize pass. std::unique_ptr> CreatePrepareQuantizePass( - const quant::QuantizationSpecs& quant_specs); + const QuantizationSpecs& quant_specs); std::unique_ptr> CreatePrepareQuantizePass(); // Creates an instance of the TensorFlow Lite dialect // PrepareDynamicRangeQuantize pass. std::unique_ptr> -CreatePrepareDynamicRangeQuantizePass( - const quant::QuantizationSpecs& quant_specs); +CreatePrepareDynamicRangeQuantizePass(const QuantizationSpecs& quant_specs); std::unique_ptr> CreatePrepareDynamicRangeQuantizePass(); @@ -144,7 +145,7 @@ CreatePrepareDynamicRangeQuantizePass(); // Creates an instance of the TensorFlow Lite dialect PostQuantize pass. std::unique_ptr> CreatePostQuantizePass(); std::unique_ptr> CreatePostQuantizePass( - bool emit_quant_adaptor_ops, const quant::CustomOpMap& custom_op_map = {}); + bool emit_quant_adaptor_ops, const CustomOpMap& custom_op_map = {}); // Creates an instance of the TensorFlow Lite dialect QuantizeVariables pass. std::unique_ptr> CreatePrepareQuantizeVariablesPass(); @@ -224,7 +225,7 @@ std::unique_ptr> CreateRaiseCustomOpsPass( // Creates raise custom ops pass, which legalize custom ops to TFL::CustomOp std::unique_ptr> CreateLowerCustomOpsPass(); -// Inserts an TFL::CallOnce op when the tf_saved_model's session initialzer is +// Inserts a TFL::CallOnce op when the tf_saved_model's session initialzer is // given. 
std::unique_ptr> CreateInsertCallOnceOpFromSessionInitializerPass(); @@ -289,6 +290,11 @@ inline std::unique_ptr CreateCanonicalizeBoundaryValuePass() { std::unique_ptr> CreatePartitionedTopologicalSortPass(); +// Create a pass that cleans up optimization barriers. +inline std::unique_ptr CreateCleanupOptimizationBarrierPass() { + return Create(); +} + #define GEN_PASS_DECL_DEFAULTQUANTPARAMSPASS #define GEN_PASS_DECL_LEGALIZETFPASS #define GEN_PASS_DECL_LOWERSTATICTENSORLISTPASS @@ -340,13 +346,14 @@ inline void registerTensorFlowLitePasses() { Register(); Register(); Register(); - Register(); + Register(); Register(); Register(); // Other TFLite Passes Register(); Register(); + Register(); } } // namespace TFL diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.td b/tensorflow/compiler/mlir/lite/transforms/passes.td index 10e3156855ef..cf2cc345e34d 100644 --- a/tensorflow/compiler/mlir/lite/transforms/passes.td +++ b/tensorflow/compiler/mlir/lite/transforms/passes.td @@ -283,7 +283,8 @@ def PrepareTFPass : Pass<"tfl-prepare-tf", "mlir::func::FuncOp"> { let dependentDialects = ["TFL::TensorFlowLiteDialect", "mlir::quant::QuantDialect", "mlir::quantfork::QuantizationForkDialect", - "mhlo::MhloDialect" + "mhlo::MhloDialect", + "stablehlo::StablehloDialect" ]; let options = [ Option<"unfold_batch_matmul_", "unfold_batchmatmul", diff --git a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc index 914d426f278d..2538cc423cdf 100644 --- a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc @@ -15,22 +15,31 @@ limitations under the License. // This transformation pass applies some clean up steps after quantization. +#include +#include #include #include +#include "llvm/ADT/ArrayRef.h" #include "llvm/Support/Casting.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" //===----------------------------------------------------------------------===// // The post-quantize Passes. @@ -52,7 +61,7 @@ class PostQuantizePass : public impl::PostQuantizePassBase { // Constructor used by manually creating the pass. 
explicit PostQuantizePass(bool emit_quant_adaptor_ops, - const quant::CustomOpMap& custom_op_map) + const CustomOpMap& custom_op_map) : custom_op_map_(custom_op_map) { // Set this flag to true if the inputs and outputs are in floating point. // The quant adaptor ops convert them to fixed point values (i.e. quantize) @@ -64,7 +73,7 @@ class PostQuantizePass : public impl::PostQuantizePassBase { void runOnOperation() override; private: - quant::CustomOpMap custom_op_map_; + CustomOpMap custom_op_map_; }; // Cleans up unnecessary QDQ pattern for input/output ops. @@ -155,6 +164,92 @@ enum RemoveVolatileOpsType { kPreserveInputsAndOutputs, }; +// Returns a constant tensor with the given scalar/vector value and shape. +template +std::optional GetConstTensor(PatternRewriter& rewriter, + Location loc, llvm::ArrayRef vec, + llvm::ArrayRef shape) { + int64_t num_total_elements = 1; + for (int64_t a : shape) { + num_total_elements *= a; + } + + if (vec.size() != num_total_elements) { + return std::nullopt; + } + + auto const_type = tensorflow::GetTypeFromTFTensorShape( + shape, rewriter.getIntegerType(sizeof(T) * 8)); + auto const_attr = DenseElementsAttr::get(const_type, vec); + + auto const_op = + rewriter.create(loc, const_type, const_attr); + return const_op.getResult(); +} + +// Converts a dequantize op to a (scale * (input - zeropoint)). The expectation +// is that the qconst value will be constant folded to retain the original +// constant value. This is essentially a constant fold of the dequantize op, +// privided that the value, zp and scale are all constants. +std::optional ConvertDequantizeOp( + PatternRewriter& rewriter, mlir::Operation* op, + mlir::ShapedType output_type, mlir::Value input_value, + llvm::ArrayRef scale, llvm::ArrayRef zeropoint, + int64_t dim) { + RankedTensorType input_type = + dyn_cast(input_value.getType()); + if (!input_type) return std::nullopt; + + std::optional zp_val; + if (zeropoint.size() == 1) { + auto const_type = + tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()); + auto const_attr = + DenseElementsAttr::get(const_type, static_cast(zeropoint[0])); + + auto const_op = rewriter.create(op->getLoc(), const_type, + const_attr); + zp_val = const_op.getResult(); + } else { + SmallVector shape; + shape.resize(input_type.getRank(), 1); + shape[dim] = zeropoint.size(); + zp_val = GetConstTensor(rewriter, op->getLoc(), zeropoint, shape); + } + + std::optional scale_val; + if (scale.size() == 1) { + auto const_type = + tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()); + auto const_attr = + DenseElementsAttr::get(const_type, static_cast(scale[0])); + + auto const_op = rewriter.create(op->getLoc(), const_type, + const_attr); + scale_val = const_op.getResult(); + } else { + SmallVector shape; + shape.resize(input_type.getRank(), 1); + shape[dim] = scale.size(); + scale_val = GetConstTensor(rewriter, op->getLoc(), scale, shape); + } + + if (!zp_val || !scale_val) return std::nullopt; + + auto op1_cast_in = + rewriter.create(op->getLoc(), output_type, input_value); + + auto op2_sub_op1 = rewriter.create( + op->getLoc(), output_type, op1_cast_in.getResult(), zp_val.value(), + /*fused_activation_function=*/rewriter.getStringAttr("NONE")); + + return rewriter + .create( + op->getLoc(), output_type, op2_sub_op1.getResult(), scale_val.value(), + /*fused_activation_function=*/rewriter.getStringAttr("NONE")) + .getResult(); +} + // Remove the back-to-back quantize and dequantize ops with volatile attribute. 
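// For the volatile tfl.pseudo_qconst feeding a tfl.dequantize, handled by the
// pattern below, ConvertDequantizeOp above emits roughly the following
// (per-tensor case; the quantization parameters here are assumed for this
// note):
//   %q  = "tfl.pseudo_qconst"() : () -> tensor<2x!quant.uniform<i8:f32, 0.5:3>>
//   %dq = "tfl.dequantize"(%q)  : (...) -> tensor<2xf32>
// becomes
//   %f   = tfl.cast(%w)              // %w holds the raw i8 storage values
//   %sub = tfl.sub(%f, dense<3.0>)   // subtract the zero point
//   %out = tfl.mul(%sub, dense<0.5>) // multiply by the scale
// so that constant folding can recover the dequantized weights.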
template struct RemoveVolatileOps : public OpRewritePattern { @@ -165,7 +260,7 @@ struct RemoveVolatileOps : public OpRewritePattern { PatternRewriter& rewriter) const override { auto input_op = op.getInput().getDefiningOp(); if (auto q = llvm::dyn_cast_or_null(input_op)) { - if (!q->getAttr(mlir::quant::kVolatileOpAttrName)) return failure(); + if (!q->getAttr(kVolatileOpAttrName)) return failure(); if (remove_volatile_ops_type == kPreserveInputsAndOutputs) { // Don't remove leading and trailing QDQ for PTQ workflow, so the io @@ -188,6 +283,47 @@ struct RemoveVolatileOps : public OpRewritePattern { op.replaceAllUsesWith(q.getInput()); return success(); + } else if (auto qconst_op = llvm::dyn_cast_or_null(input_op)) { + if (!qconst_op->getAttr(kVolatileOpAttrName)) return failure(); + + auto qtype = + quant::QuantizedType::getQuantizedElementType(qconst_op.getType()); + if (!qtype) return failure(); + SmallVector scale; + SmallVector zeropoint; + int64_t dim = 0; + + if (auto uniform_qtype = + mlir::dyn_cast(qtype)) { + scale.push_back(uniform_qtype.getScale()); + zeropoint.push_back(uniform_qtype.getZeroPoint()); + } else if (auto per_axis_qtype = + mlir::dyn_cast( + qtype)) { + scale.assign(per_axis_qtype.getScales().begin(), + per_axis_qtype.getScales().end()); + zeropoint.assign(per_axis_qtype.getZeroPoints().begin(), + per_axis_qtype.getZeroPoints().end()); + dim = per_axis_qtype.getQuantizedDimension(); + } else { + return failure(); + } + + auto output_type = mlir::cast(op.getOutput().getType()); + + auto const_type = tensorflow::GetTypeFromTFTensorShape( + output_type.getShape(), qtype.getStorageType()); + auto const_op = rewriter.create( + op->getLoc(), const_type, qconst_op.getValue()); + + auto new_value = + ConvertDequantizeOp(rewriter, op, output_type, const_op.getResult(), + scale, zeropoint, dim); + if (!new_value) return failure(); + + op.replaceAllUsesWith(new_value.value()); + op->erase(); + return success(); } return failure(); } @@ -358,8 +494,8 @@ struct FoldReshapeOp : public OpRewritePattern { template struct PruneUnusedOpsWithSideEffect : public OpRewritePattern { public: - explicit PruneUnusedOpsWithSideEffect( - MLIRContext* context, const quant::CustomOpMap& custom_op_map = {}) + explicit PruneUnusedOpsWithSideEffect(MLIRContext* context, + const CustomOpMap& custom_op_map = {}) : OpRewritePattern(context), custom_op_map(custom_op_map) {} LogicalResult matchAndRewrite(OpTy op, @@ -384,7 +520,7 @@ struct PruneUnusedOpsWithSideEffect : public OpRewritePattern { rewriter.eraseOp(op); return success(); } - quant::CustomOpMap custom_op_map; + CustomOpMap custom_op_map; }; #include "tensorflow/compiler/mlir/lite/transforms/generated_post_quantize.inc" @@ -392,15 +528,14 @@ struct PruneUnusedOpsWithSideEffect : public OpRewritePattern { void PostQuantizePass::runOnOperation() { if (!enable_custom_op_no_side_effect_.empty()) { ParseCustomOpSpecs(enable_custom_op_no_side_effect_, - quant::CustomOpUpdateOptions::kNoSideEffect, - custom_op_map_); + CustomOpUpdateOptions::kNoSideEffect, custom_op_map_); } RewritePatternSet patterns(&getContext()); auto func = getOperation(); auto* ctx = func.getContext(); TFL::populateWithGenerated(patterns); - patterns.add>(ctx); + patterns.add>(ctx); patterns.add>(ctx); patterns.add>( ctx); @@ -415,7 +550,7 @@ void PostQuantizePass::runOnOperation() { RewritePatternSet phase_2_patterns(&getContext()); TFL::populateWithGenerated(phase_2_patterns); - phase_2_patterns.add, + phase_2_patterns.add, RemoveVolatileOps, FoldTransposeOp, 
FoldReshapeOp>(ctx); (void)applyPatternsGreedily(func, std::move(phase_2_patterns)); @@ -434,7 +569,7 @@ void PostQuantizeRemoveQDQPass::runOnOperation() { // Creates an instance of the TensorFlow Lite dialect PostQuantize pass. std::unique_ptr> CreatePostQuantizePass( - bool emit_quant_adaptor_ops, const quant::CustomOpMap& custom_op_map) { + bool emit_quant_adaptor_ops, const CustomOpMap& custom_op_map) { return std::make_unique(emit_quant_adaptor_ops, custom_op_map); } diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td index 1afceede5252..568b5357836f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td @@ -20,7 +20,7 @@ include "tensorflow/compiler/mlir/lite/utils/utils.td" def FalseBoolAttr : AttrConstraint>; def DenseElementsAttr : ElementsAttrBase< - CPred<"$_self.isa()">, + CPred<"llvm::isa($_self)">, "non-opaque constant tensor">; def CreateGatherNdOp : NativeCodeCall< @@ -109,10 +109,10 @@ def RemoveIdentityN : Pat<(TF_IdentityNOp $arg), (replaceWithValue $arg)>; // Casts result type of $1 to a quantized type by using the quantization // parameters from the type in $0. class UpdateShapeWithAxis : NativeCodeCall< - "quant::CastQuantizedTypeAttrFromExpressedType($_builder, $0, $1.getType(), " # i # ")">; + "CastQuantizedTypeAttrFromExpressedType($_builder, $0, $1.getType(), " # i # ")">; class CanUpdateShapeWithAxis : Constraint< - CPred<"quant::CastQuantizedTypeAttrFromExpressedType($_builder, $0, $1.getType(), " # i # ")">>; + CPred<"CastQuantizedTypeAttrFromExpressedType($_builder, $0, $1.getType(), " # i # ")">>; class UsedBy : Constraint< CPred<"llvm::isa(*$0.getUsers().begin())">>; diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index f3624f0393c5..96a6ab06dc62 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -45,12 +45,13 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/tfl_quantization_driver.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h" -#include "tensorflow/compiler/mlir/lite/transforms/tfl_quantization_driver.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/monitoring/counter.h" @@ -83,7 +84,7 @@ class PrepareQuantizePass explicit PrepareQuantizePass() : use_quantization_flags_(true) {} // Constructor used by manually creating the pass. 
- explicit PrepareQuantizePass(const quant::QuantizationSpecs& quant_specs) + explicit PrepareQuantizePass(const QuantizationSpecs& quant_specs) : use_quantization_flags_(false), quant_specs_(quant_specs) {} void runOnOperation() override; @@ -132,7 +133,7 @@ class PrepareQuantizePass bool ContainsQuantizeOps(func::FuncOp func); bool use_quantization_flags_; - quant::QuantizationSpecs quant_specs_; + QuantizationSpecs quant_specs_; }; bool PrepareQuantizePass::SetInputNodesQuantizationParams(func::FuncOp func) { @@ -193,7 +194,7 @@ bool PrepareQuantizePass::SetInputNodesQuantizationParams(func::FuncOp func) { // The input min/max or mean/std are not specified, then skip. if (!min_max.first.has_value() || !min_max.second.has_value()) return; - TypeAttr params = quant::GetQuantizedTypeAttr( + TypeAttr params = GetQuantizedTypeAttr( builder, input_type, builder.getF64FloatAttr(min_max.first.value()), builder.getF64FloatAttr(min_max.second.value()), /*quant_dim=*/-1, num_bits, narrow_range, is_signed); @@ -324,8 +325,7 @@ bool PrepareQuantizePass::ContainsQuantizeOps(func::FuncOp func) { } using PrepareQuantStats = - quant::ConvertStatsToQDQs; + ConvertStatsToQDQs; void PrepareQuantizePass::runOnOperation() { func::FuncOp func = getOperation(); @@ -345,7 +345,7 @@ void PrepareQuantizePass::runOnOperation() { quant_specs_.disable_set_input_nodes_quantization_params = disable_set_input_nodes_quantization_params_; quant_specs_.qdq_conversion_mode = - quant::GetQDQQuantModeFromString(qdq_conversion_mode_); + GetQDQQuantModeFromString(qdq_conversion_mode_); for (const auto& ir : input_ranges_) { std::pair input_range = absl::StrSplit(ir, '|'); @@ -403,7 +403,7 @@ void PrepareQuantizePass::runOnOperation() { patterns_1.add>(ctx); patterns_1.add>(ctx); } - if (quant_specs_.qdq_conversion_mode != quant::QDQConversionMode::kQDQNone) { + if (quant_specs_.qdq_conversion_mode != QDQConversionMode::kQDQNone) { patterns_1.add(ctx); } @@ -413,8 +413,7 @@ void PrepareQuantizePass::runOnOperation() { // convert all of them to signed. RewritePatternSet patterns_2(&getContext()); if (is_signed) { - patterns_2.add>( - ctx); + patterns_2.add>(ctx); } // Convert quant stats to int8, unit8, int16 quantization parameters. // Currently, only activation stats are imported, so narrow_range = false. @@ -436,14 +435,13 @@ void PrepareQuantizePass::runOnOperation() { // Bind the getter with the fixed configuration parameter for the correct // quantization settings of the ops. - std::function(Operation*)> - op_quant_spec_getter = - std::bind(GetOpQuantSpec, std::placeholders::_1, - quant_specs_.disable_per_channel_for_dense_layers); + std::function(Operation*)> op_quant_spec_getter = + std::bind(GetOpQuantSpec, std::placeholders::_1, + quant_specs_.disable_per_channel_for_dense_layers); // Finally, the quantization parameters can be propagated to the rest of the // values (tensors). - ApplyQuantizationParamsPropagation( + temp::ApplyQuantizationParamsPropagation( func, is_signed, bit_width, disable_per_channel_ || quant_specs_.disable_per_channel, op_quant_spec_getter, infer_tensor_range, quant_specs_.legacy_float_scale, @@ -454,7 +452,7 @@ void PrepareQuantizePass::runOnOperation() { // Creates an instance of the TensorFlow Lite dialect PrepareQuantize pass. 
std::unique_ptr> CreatePrepareQuantizePass( - const quant::QuantizationSpecs& quant_specs) { + const QuantizationSpecs& quant_specs) { return std::make_unique(quant_specs); } diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc index dd30318e48ca..645e74a1c75b 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc @@ -23,14 +23,16 @@ limitations under the License. #include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/types.h" @@ -68,8 +70,7 @@ class PrepareDynamicRangeQuantizePass } // Constructor used by manually creating the pass. - explicit PrepareDynamicRangeQuantizePass( - const quant::QuantizationSpecs& quant_specs) + explicit PrepareDynamicRangeQuantizePass(const QuantizationSpecs& quant_specs) : quant_specs_(quant_specs) { enable_dynamic_range_per_channel_quantization_ = !quant_specs_.disable_per_channel; @@ -91,7 +92,7 @@ class PrepareDynamicRangeQuantizePass // minimum_elements_for_weights threshold. Prevents emitting duplicate // warnings for the same op, once deemed ineligible for quantization. 
llvm::SetVector visited_nonquantizable_ops_; - quant::QuantizationSpecs quant_specs_; + QuantizationSpecs quant_specs_; }; #include "tensorflow/compiler/mlir/lite/utils/generated_op_quant_spec_getters.inc" @@ -102,7 +103,7 @@ class PrepareDynamicRangeQuantizableOp : public OpRewritePattern { public: explicit PrepareDynamicRangeQuantizableOp( - MLIRContext* context, const quant::QuantizationSpecs& quant_specs, + MLIRContext* context, const QuantizationSpecs& quant_specs, llvm::SetVector* const visited_nonquantizable_ops) : OpRewritePattern(context), visited_nonquantizable_ops_(visited_nonquantizable_ops), @@ -300,13 +301,13 @@ class PrepareDynamicRangeQuantizableOp if (op_with_per_axis_support) { quant_type = mlir::dyn_cast( - quant::GetUniformQuantizedPerAxisTypeForWeight( + GetUniformQuantizedPerAxisTypeForWeight( attr, affine_user.GetQuantizationDimIndex(), /*symmetric=*/true, bit_width, is_signed, is_narrow_range, is_legacy_float)); } else { - quant_type = mlir::dyn_cast( - quant::GetUniformQuantizedTypeForWeight( + quant_type = + mlir::dyn_cast(GetUniformQuantizedTypeForWeight( attr, is_narrow_range && is_signed, bit_width, is_signed, is_narrow_range, is_legacy_float)); } @@ -459,7 +460,7 @@ class PrepareDynamicRangeQuantizableOp } protected: - quant::QuantizationSpecs quant_specs_; + QuantizationSpecs quant_specs_; }; // Remove all the stats ops which are redundant for dynamic range quantizaiton. @@ -486,7 +487,7 @@ void PrepareDynamicRangeQuantizePass::runOnOperation() { if (!enable_custom_op_quantization_.empty()) { ParseCustomOpSpecs(enable_custom_op_quantization_, - quant::CustomOpUpdateOptions::kInputIndices, + CustomOpUpdateOptions::kInputIndices, quant_specs_.custom_map); } @@ -506,8 +507,7 @@ void PrepareDynamicRangeQuantizePass::runOnOperation() { // Creates an instance of the TensorFlow Lite dialect // PrepareDynamicRangeQuantize pass. std::unique_ptr> -CreatePrepareDynamicRangeQuantizePass( - const quant::QuantizationSpecs& quant_specs) { +CreatePrepareDynamicRangeQuantizePass(const QuantizationSpecs& quant_specs) { return std::make_unique(quant_specs); } diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h index 2b2885761fd3..e9e99cc21864 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h @@ -40,14 +40,14 @@ limitations under the License. 
#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_traits.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/lite/tools/optimize/operator_property.h" #include "tensorflow/compiler/mlir/lite/utils/shape_and_size_utils.h" #include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h" #include "tensorflow/core/framework/types.pb.h" @@ -230,13 +230,13 @@ template class ConvertOpStatsToQDQs : public OpRewritePattern { public: explicit ConvertOpStatsToQDQs(MLIRContext* context, - const quant::QuantizationSpecs& quant_specs, + const QuantizationSpecs& quant_specs, PatternBenefit benefit = 1) : OpRewritePattern(context, benefit), quant_specs_(quant_specs) {} protected: - quant::QuantizationSpecs quant_specs_; + QuantizationSpecs quant_specs_; LogicalResult processInputs( SourceOp op, const operator_property::OpVariant& op_variant, @@ -306,8 +306,8 @@ class ConvertOpStatsToQDQs : public OpRewritePattern { SmallVector mins(1, std::numeric_limits::max()); SmallVector maxs(1, std::numeric_limits::min()); // Computes the effective min/max values of the attribute values. 
- quant::ExtractMinMaxFromAttr(attr, /*dim_size=*/1, /*slice_size=*/1, - /*symmetric=*/true, mins, maxs); + ExtractMinMaxFromAttr(attr, /*dim_size=*/1, /*slice_size=*/1, + /*symmetric=*/true, mins, maxs); double scale = maxs[0] / -llvm::minIntN(tensor_property.number_of_bits); quant_type = UniformQuantizedType::getChecked( const_op->getLoc(), quant::QuantizationFlags::Signed, @@ -315,7 +315,7 @@ class ConvertOpStatsToQDQs : public OpRewritePattern { /*zeroPoint=*/0, llvm::minIntN(10), -llvm::minIntN(10)); } else { quant_type = mlir::dyn_cast( - quant::GetUniformQuantizedTypeForWeight( + GetUniformQuantizedTypeForWeight( attr, /*symmetric=*/true, /*num_bits=*/tensor_property.number_of_bits, /*is_signed=*/true, @@ -393,7 +393,8 @@ class ConvertOpStatsToQDQs : public OpRewritePattern { /*isSigned=*/true); } if (quant_specs_.legacy_float_scale) { - quant_type = quant::DownCastScale(quant_type, min, max, op.getLoc()); + quant_type = + ::mlir::TFL::DownCastScale(quant_type, min, max, op.getLoc()); } } rewriter.setInsertionPointAfter(stats_op); @@ -410,7 +411,7 @@ template class ConvertLstmStatsToQDQs : public ConvertOpStatsToQDQs { public: ConvertLstmStatsToQDQs(MLIRContext* context, - const quant::QuantizationSpecs& quant_specs) + const QuantizationSpecs& quant_specs) : ConvertOpStatsToQDQs(context, quant_specs), activation_number_of_bits_(quant_specs.GetQuantizationTypeWidth()) {} LogicalResult matchAndRewrite(SourceOp op, @@ -476,9 +477,9 @@ class ConvertLstmStatsToQDQs : public ConvertOpStatsToQDQs { /*narrowRange=*/false, calibrated_type.getExpressedType(), /*isSigned=*/this->quant_specs_.IsSignedInferenceType()); if (this->quant_specs_.legacy_float_scale) { - qtype = mlir::cast( - quant::DownCastScale(qtype, calibrated_type.getMin(), - calibrated_type.getMax(), op.getLoc())); + qtype = mlir::cast(::mlir::TFL::DownCastScale( + qtype, calibrated_type.getMin(), calibrated_type.getMax(), + op.getLoc())); } } else if (tensor_property.number_of_bits == 16) { double max = std::max(std::abs(calibrated_type.getMin()), @@ -505,13 +506,13 @@ class ConvertLstmStatsToQDQs : public ConvertOpStatsToQDQs { // Returns a function that returns the quantized type of a bias input. // The scale of bias is a multiplication of given scale and scales from the // quantization type of other operands. -inline quant::AccumulatorScaleFunc GetUniformQuantizedTypeForBiasWithScale( +inline AccumulatorScaleFunc GetUniformQuantizedTypeForBiasWithScale( double scale) { - return [=](const std::vector& quant_params, + return [=](const std::vector& quant_params, const int adjusted_quant_dim, - const bool legacy_float_scale) -> quant::QuantParams { + const bool legacy_float_scale) -> QuantParams { if (auto qtype = mlir::dyn_cast_or_null( - quant::GetUniformQuantizedTypeForBias( + ::mlir::TFL::GetUniformQuantizedTypeForBias( quant_params, legacy_float_scale, adjusted_quant_dim))) { return quant::UniformQuantizedType::get( qtype.getFlags(), qtype.getStorageType(), qtype.getExpressedType(), @@ -524,14 +525,14 @@ inline quant::AccumulatorScaleFunc GetUniformQuantizedTypeForBiasWithScale( // Returns quantization spec for LSTMs based on their operator properties. 
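+// Bias inputs are paired with GetUniformQuantizedTypeForBiasWithScale (above),
+// so each bias scale is derived from the scales of the operands listed in
+// derived_scale.input_tensors.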
template -std::unique_ptr GetLstmOpQuantSpec(LstmOp op) { +std::unique_ptr GetLstmOpQuantSpec(LstmOp op) { operator_property::OpVariant lstm_variant; operator_property::OperatorProperty lstm_property; if (failed(GetLstmProperty(op, &lstm_variant, &lstm_property))) { return nullptr; } - auto spec = std::make_unique(); + auto spec = std::make_unique(); for (const auto& enumerated_inputs : lstm_property.inputs) { int index = enumerated_inputs.first; @@ -556,8 +557,9 @@ std::unique_ptr GetLstmOpQuantSpec(LstmOp op) { } spec->biases_params.emplace( index, - std::make_pair(tensor_property.derived_scale.input_tensors, - GetUniformQuantizedTypeForBiasWithScale(scale))); + std::make_pair( + tensor_property.derived_scale.input_tensors, + ::mlir::TFL::GetUniformQuantizedTypeForBiasWithScale(scale))); } } return spec; @@ -565,8 +567,8 @@ std::unique_ptr GetLstmOpQuantSpec(LstmOp op) { class ConvertSvdfStatsToQDQs : public ConvertOpStatsToQDQs { public: - explicit ConvertSvdfStatsToQDQs( - MLIRContext* context, const quant::QuantizationSpecs& quant_specs_param) + explicit ConvertSvdfStatsToQDQs(MLIRContext* context, + const QuantizationSpecs& quant_specs_param) : ConvertOpStatsToQDQs(context, quant_specs_param) {} LogicalResult matchAndRewrite(TFL::SVDFOp op, PatternRewriter& rewriter) const override { diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index 3f85702837a9..957d243e7277 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -42,6 +42,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/LogicalResult.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" // from @llvm-project #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -64,7 +65,6 @@ limitations under the License. #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_passes.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h" #include "tensorflow/compiler/mlir/lite/transforms/dilated_conv.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -74,6 +74,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/utils/shape_and_size_utils.h" #include "tensorflow/compiler/mlir/lite/utils/utils.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/einsum.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" @@ -83,6 +84,8 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla_passes.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" +#include "xla/mlir_hlo/mhlo/transforms/rewriters.h" +#include "xla/mlir_hlo/mhlo/utils/type_conversion.h" #define DEBUG_TYPE "tf-tfl-legalization" @@ -1367,10 +1370,15 @@ LogicalResult ConvertTf2XlaOps(func::FuncOp func, MLIRContext *context) { mhlo::Tf2XlaTypeConverter converter; mhlo::PopulateLegalizeTfWithTf2XlaPatterns("XLA_CPU_JIT", patterns, context, converter); - mhlo::PopulateLegalizeTfPatterns(context, &patterns); + hlo::PopulateLegalizeTfPatterns(context, &patterns); mlir::odml::PopulateLegalizeHloToTfPatterns(&patterns, context); mhlo::GatherOp::getCanonicalizationPatterns(patterns, context); + // hlo::PopulateLegalizeTfPatterns emits StableHLO ops, until this pipeline + // handles StableHLO ops directly, we need to convert them to MHLO ops. + stablehlo::StablehloToHloTypeConverter hlo_converter; + stablehlo::populateStablehloToHloPatterns(&patterns, &hlo_converter, context); + return applyPartialConversion(func, target, std::move(patterns)); } @@ -1499,6 +1507,32 @@ struct RemoveIdentity : public OpRewritePattern { } }; +llvm::FailureOr TryGetAncestorFakeQuantOp( + Operation *operand) { + if (auto fq = + mlir::dyn_cast_or_null(operand)) { + return fq; + } + + auto dq = mlir::dyn_cast_or_null(operand); + if (!dq) { + return failure(); + } + + auto q = + mlir::dyn_cast_or_null(dq.getInput().getDefiningOp()); + if (!q) { + return failure(); + } + + if (auto fq = mlir::dyn_cast_or_null( + q.getInput().getDefiningOp())) { + return fq; + } + + return failure(); +} + // Quantizes Concat ops where the inputs are quantized with fake quant but the // result is not explicitly quantized. Without this, later quantization passes // handle the quantization of the concat op incorrectly. @@ -1523,22 +1557,11 @@ class QuantizeConcatResult : public OpRewritePattern { // fake quants. llvm::SmallVector fake_quant_ops; for (Value operand_value : concat.getValues()) { - auto dq = mlir::dyn_cast_or_null( - operand_value.getDefiningOp()); - - if (!dq) { + auto fq_or = TryGetAncestorFakeQuantOp(operand_value.getDefiningOp()); + if (failed(fq_or)) { return failure(); } - - auto q = mlir::dyn_cast_or_null( - dq.getInput().getDefiningOp()); - - if (!q) { - return failure(); - } - - auto fq = mlir::dyn_cast_or_null( - q.getInput().getDefiningOp()); + auto fq = fq_or.value(); if (!fq) { return failure(); @@ -1635,30 +1658,11 @@ class QuantizeMeanResult : public OpRewritePattern { } } - // At this point, all pre-existing FakeQuantWithMinMaxVarsOps should have - // had qdq ops generated so we'll need to follow up the chain to get to the - // fake quants. 
- Value operand_value = mean.getInput(); - auto dq = mlir::dyn_cast_or_null( - operand_value.getDefiningOp()); - - if (!dq) { - return failure(); - } - - auto q = - mlir::dyn_cast_or_null(dq.getInput().getDefiningOp()); - - if (!q) { - return failure(); - } - - auto fq = mlir::dyn_cast_or_null( - q.getInput().getDefiningOp()); - - if (!fq) { + auto fq_or = TryGetAncestorFakeQuantOp(mean.getInput().getDefiningOp()); + if (failed(fq_or)) { return failure(); } + auto fq = fq_or.value(); Value mean_result = mean.getResult(); llvm::SmallVector uses; diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize.cc b/tensorflow/compiler/mlir/lite/transforms/quantize.cc index ae1674b58629..8c411b93542a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/quantize.cc @@ -15,7 +15,7 @@ limitations under the License. // This transformation pass applies quantization on TFLite dialect. -#include +#include #include #include #include @@ -53,13 +53,13 @@ limitations under the License. #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_config.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_traits.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/lower_quant_annotations_helper.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/utils.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" namespace mlir { namespace TFL { @@ -94,9 +94,11 @@ static LogicalResult HasDQParent(Value value, Value& dq_input) { return failure(); } +// The assumption here is that the op has at least one DQ operand since the +// pattern's root is that. static OpQuantizationType GetOpQuantizationType(Operation* op) { - // The assumption here is that the op has at least one DQ operand since the - // pattern's root is that. + const absl::flat_hash_set kDrqOpsWithNoDrqInput = { + "tfl.embedding_lookup"}; // Indicates if an input which is not an FQ is seen. bool non_fq_float_input_seen = false; @@ -112,6 +114,10 @@ static OpQuantizationType GetOpQuantizationType(Operation* op) { continue; } + if (kDrqOpsWithNoDrqInput.contains(op->getName().getStringRef().str())) { + return OpQuantizationType::kDRQ; + } + auto element_type = getElementTypeOrSelf(operand.getType()); // Ignore non-f32 tensors when determining the quantization type. @@ -158,7 +164,7 @@ class StrictQuantizationPattern : public RewritePattern { using BaseType = StrictQuantizationPattern; explicit StrictQuantizationPattern(MLIRContext* context, - const quant::QuantPassSpec& quant_params) + const QuantPassSpec& quant_params) // Set the score to a large number so it is always preferred. 
: RewritePattern(DequantizeOp::getOperationName(), 300, context), quant_params_(quant_params) {} @@ -177,7 +183,7 @@ class StrictQuantizationPattern : public RewritePattern { bool enable_verify = quant_params_.numeric_verify_spec.verify_numeric; bool enable_whole_model_verify = quant_params_.numeric_verify_spec.whole_model_verify; - quant::CustomOpMap custom_map = quant_params_.quant_spec.custom_map; + CustomOpMap custom_map = quant_params_.quant_spec.custom_map; // Rewrite the floating-point ops to the quantized version, by fusing // preceding dequantize ops and succeding quantize ops. @@ -195,29 +201,28 @@ class StrictQuantizationPattern : public RewritePattern { return failure(); } - if (!quant::IsOpQuantizable(quantizing_op) && + if (!IsOpQuantizable(quantizing_op) && !IsQuantizableCustomOp(quantizing_op, custom_map)) { if (!(enable_verify && enable_whole_model_verify)) { return failure(); } - if (quantizing_op->hasAttr(quant::kDebugModeOpQuantAttrName) || - quantizing_op->hasAttr(quant::kDebugModeOpFloatAttrName)) { + if (quantizing_op->hasAttr(kDebugModeOpQuantAttrName) || + quantizing_op->hasAttr(kDebugModeOpFloatAttrName)) { return failure(); } rewriter.setInsertionPoint(quantizing_op); Operation* float_op = rewriter.clone(*quantizing_op); - quantizing_op->setAttr(quant::kDebugModeOpQuantAttrName, + quantizing_op->setAttr(kDebugModeOpQuantAttrName, rewriter.getUnitAttr()); - float_op->setAttr(quant::kDebugModeOpFloatAttrName, - rewriter.getUnitAttr()); + float_op->setAttr(kDebugModeOpFloatAttrName, rewriter.getUnitAttr()); RewireFloatModelBackbone(quantizing_op, float_op); return success(); } // An op with float inputs and outputs are expected when it's used by a // NumericVerify op. Skip this op. - if (enable_verify && quant::UsedBy(quantizing_op)) { + if (enable_verify && UsedBy(quantizing_op)) { continue; } @@ -236,7 +241,7 @@ class StrictQuantizationPattern : public RewritePattern { inputs.reserve(quantizing_op->getNumOperands()); for (auto operand : quantizing_op->getOperands()) { Type operand_type = operand.getType(); - if (operand_type.isa()) { + if (mlir::isa(operand_type)) { inputs.push_back(operand); continue; } @@ -267,7 +272,7 @@ class StrictQuantizationPattern : public RewritePattern { } Operation* quantized_op; - if (quant::QuantizableOpSupportsFloatOutputType(quantizing_op)) { + if (QuantizableOpSupportsFloatOutputType(quantizing_op)) { rewriter.setInsertionPointAfter(quantizing_op); OperationState new_state( quantizing_op->getLoc(), quantizing_op->getName().getStringRef(), @@ -292,7 +297,7 @@ class StrictQuantizationPattern : public RewritePattern { Type result_type = result.getType(); // Add this to the test coverage once we create test ops with none // type results. - if (result_type.isa()) { + if (mlir::isa(result_type)) { outputs_replaced.insert({result, enumerated_result.index()}); output_types.push_back(result_type); continue; @@ -384,7 +389,7 @@ class StrictQuantizationPattern : public RewritePattern { private: bool IsQuantizableCustomOp(Operation* op, - const quant::CustomOpMap& custom_op_map) const { + const CustomOpMap& custom_op_map) const { // In some cases, ops may need to be quantized even though their op trait is // not quantizable. For example, for the case of custom op various ops can // be categorized as cusom ops despite each of them may require different @@ -413,7 +418,7 @@ class StrictQuantizationPattern : public RewritePattern { // compared against in parallel. // N.B. the return op will use this floating-point result. 
Value result; - if (!quant::IsOpQuantizable(float_op)) { + if (!IsOpQuantizable(float_op)) { // For not quantizable ops, search for dequantize attached to the // quantized op of the output. if (Operation* quantize_op = dyn_cast_or_null( @@ -441,31 +446,29 @@ class StrictQuantizationPattern : public RewritePattern { // the float backbone. dequantize.getResult().replaceUsesWithIf( float_op->getResult(i), [&](OpOperand& use) { - return !use.getOwner()->hasAttr( - quant::kDebugModeOpQuantAttrName); + return !use.getOwner()->hasAttr(kDebugModeOpQuantAttrName); }); } } } } - quant::QuantPassSpec quant_params_; + QuantPassSpec quant_params_; }; // Base struct for quantization. template struct TFLQuantizationBase - : public quant::QuantizationPattern { + : public QuantizationPattern { explicit TFLQuantizationBase(MLIRContext* ctx, - const quant::QuantPassSpec& quant_params) - : quant::QuantizationPattern(ctx, - quant_params) {} + const QuantPassSpec& quant_params) + : QuantizationPattern(ctx, quant_params) {} static bool IsQuantizableCustomOp(Operation* op, - const quant::CustomOpMap& custom_op_map) { + const CustomOpMap& custom_op_map) { // In some cases, ops may need to be quantized even though their op trait is // not quantizable. For example, for the case of custom op various ops can // be categorized as cusom ops despite each of them may require different @@ -481,7 +484,7 @@ struct TFLQuantizationBase } static bool AllowDynamicRangeQuantizedOperand( - Operation* quantized_op, const quant::CustomOpMap& custom_op_map) { + Operation* quantized_op, const CustomOpMap& custom_op_map) { // Collect the input if dynamic range quantization is on and the op supports // it. return quantization_trait == kDynamicRangeQuantization && @@ -490,7 +493,7 @@ struct TFLQuantizationBase } static bool AllowDynamicRangeQuantizedResult( - Operation* quantized_op, const quant::CustomOpMap& custom_op_map) { + Operation* quantized_op, const CustomOpMap& custom_op_map) { // Collect the output if dynamic range quantization is on and the op // supports it. return quantization_trait == kDynamicRangeQuantization && @@ -501,8 +504,7 @@ struct TFLQuantizationBase static bool IsWeightOnlyOp( Operation* quantized_op, const absl::flat_hash_set& ops_blocklist, - const bool weight_only_quantization, - const quant::CustomOpMap& custom_op_map) { + const bool weight_only_quantization, const CustomOpMap& custom_op_map) { // Check whether the quantized_op needs to be quantized in weight-only // manner. 
bool is_blocklisted = false; @@ -539,7 +541,7 @@ struct TFLQuantizationBase struct TFLFullQuantization : public TFLQuantizationBase { explicit TFLFullQuantization(MLIRContext* ctx, - const quant::QuantPassSpec& quant_params) + const QuantPassSpec& quant_params) : TFLQuantizationBase( ctx, quant_params) {} }; @@ -550,7 +552,7 @@ struct TFLFullQuantizationReverse : public TFLQuantizationBase { explicit TFLFullQuantizationReverse(MLIRContext* ctx, - const quant::QuantPassSpec& quant_params) + const QuantPassSpec& quant_params) : TFLQuantizationBase(ctx, quant_params) {} }; @@ -560,7 +562,7 @@ struct TFLDynamicRangeQuantization : public TFLQuantizationBase { explicit TFLDynamicRangeQuantization(MLIRContext* ctx, - const quant::QuantPassSpec& quant_params) + const QuantPassSpec& quant_params) : TFLQuantizationBase(ctx, quant_params) {} }; @@ -577,12 +579,18 @@ class QuantizeConstPattern : public OpRewritePattern { auto qtype = op.getQtypeAttr(); Attribute quantized_attr; if (legacy_float_scale_) { - quantized_attr = quant::QuantizeLegacy(attr, qtype.getValue()); + quantized_attr = QuantizeLegacy(attr, qtype.getValue()); } else { - quantized_attr = quant::Quantize(attr, qtype.getValue()); + quantized_attr = Quantize(attr, qtype.getValue()); } if (quantized_attr) { - rewriter.replaceOpWithNewOp(op, qtype, quantized_attr); + auto qconst_op = + rewriter.create(op.getLoc(), qtype, quantized_attr); + if (auto volatile_attr = op->getAttr(kVolatileOpAttrName)) { + qconst_op->setAttr(kVolatileOpAttrName, volatile_attr); + } + op.replaceAllUsesWith(qconst_op.getOutput()); + rewriter.eraseOp(op); return success(); } } @@ -602,7 +610,7 @@ struct QuantizePass : public impl::QuantizePassBase { explicit QuantizePass() { quant_specs.inference_type = tensorflow::DT_QINT8; } // Constructor used by manually creating the pass. 
- explicit QuantizePass(const quant::QuantizationSpecs& quant_specs) + explicit QuantizePass(const QuantizationSpecs& quant_specs) : quant_specs(quant_specs) { enable_numeric_verify_ = quant_specs.verify_numeric; enable_whole_model_verify_ = quant_specs.whole_model_verify; @@ -610,13 +618,13 @@ struct QuantizePass : public impl::QuantizePassBase { enable_dynamic_range_quantization_ = quant_specs.weight_quantization; enable_weight_only_quantization_ = quant_specs.weight_only_quantization; qdq_conversion_mode_ = - quant::GetQDQQuantModeString(quant_specs.qdq_conversion_mode); + GetQDQQuantModeString(quant_specs.qdq_conversion_mode); } void runOnOperation() override; private: - quant::QuantizationSpecs quant_specs; + QuantizationSpecs quant_specs; }; #include "tensorflow/compiler/mlir/lite/transforms/generated_quantize.inc" @@ -637,7 +645,7 @@ void QuantizePass::runOnOperation() { quant_specs.weight_quantization = enable_dynamic_range_quantization_; quant_specs.weight_only_quantization = enable_weight_only_quantization_; quant_specs.qdq_conversion_mode = - quant::GetQDQQuantModeFromString(qdq_conversion_mode_); + GetQDQQuantModeFromString(qdq_conversion_mode_); if (!ops_blocklist_flag_.empty()) { quant_specs.ops_blocklist = absl::flat_hash_set( @@ -651,30 +659,29 @@ void QuantizePass::runOnOperation() { if (!enable_custom_op_weight_only_.empty()) { ParseCustomOpSpecs(enable_custom_op_weight_only_, - quant::CustomOpUpdateOptions::kWeightOnly, + CustomOpUpdateOptions::kWeightOnly, quant_specs.custom_map); } if (enable_float16_quantization_) { quant_specs.inference_type = tensorflow::DT_HALF; } - const quant::QuantPassSpec quant_params = { + const QuantPassSpec quant_params = { {quant_specs.verify_numeric, error_tolerance_, quant_specs.whole_model_verify, enable_log_if_failed_}, quant_specs}; - if (quant_specs.qdq_conversion_mode == quant::QDQConversionMode::kQDQStrict) { + if (quant_specs.qdq_conversion_mode == QDQConversionMode::kQDQStrict) { patterns.add(ctx, quant_params); patterns.add(ctx); } else if (quant_specs.weight_quantization || quant_specs.use_fake_quant_num_bits || quant_specs.qdq_conversion_mode == - quant::QDQConversionMode::kQDQDynamic) { + QDQConversionMode::kQDQDynamic) { patterns.add(ctx); quantize_by_converter_patterns::populateWithGenerated(patterns); patterns.add(ctx, quant_params); - } else if (quant_specs.qdq_conversion_mode == - quant::QDQConversionMode::kQDQNone) { + } else if (quant_specs.qdq_conversion_mode == QDQConversionMode::kQDQNone) { patterns.add(ctx); quantize_by_converter_patterns::populateWithGenerated(patterns); patterns.add(ctx, @@ -692,7 +699,7 @@ void QuantizePass::runOnOperation() { RewritePatternSet patterns_2(&getContext()); patterns_2.add(ctx, quant_specs.legacy_float_scale); if (quant_params.numeric_verify_spec.whole_model_verify) { - patterns_2.add(ctx); + patterns_2.add(ctx); } (void)applyPatternsGreedily(func, std::move(patterns_2)); } @@ -700,10 +707,10 @@ void QuantizePass::runOnOperation() { // Creates an instance of the TensorFlow Lite dialect QuantizeTFL pass. std::unique_ptr> CreateQuantizePass( - const quant::QuantizationSpecs& quant_specs, + const QuantizationSpecs& quant_specs, const absl::flat_hash_set& ops_blocklist, const absl::flat_hash_set& nodes_blocklist) { - quant::QuantizationSpecs updated_quant_specs; + QuantizationSpecs updated_quant_specs; updated_quant_specs = quant_specs; // If there's new blocklists given, update quant_specs to use the new one. 
if (!ops_blocklist.empty()) { @@ -724,7 +731,7 @@ std::unique_ptr> CreateQuantizePass( const bool legacy_float_scale, const absl::flat_hash_set& ops_blocklist, const absl::flat_hash_set& nodes_blocklist) { - quant::QuantizationSpecs quant_specs; + QuantizationSpecs quant_specs; quant_specs.verify_numeric = verify_numeric; quant_specs.whole_model_verify = whole_model_verify; quant_specs.legacy_float_scale = legacy_float_scale; diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize_by_converter_patterns.td b/tensorflow/compiler/mlir/lite/transforms/quantize_by_converter_patterns.td index 025991b2e8cc..3ff1f5458bfa 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize_by_converter_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/quantize_by_converter_patterns.td @@ -22,7 +22,7 @@ include "mlir/IR/PatternBase.td" include "mlir/Dialect/Arith/IR/ArithOps.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/IR/CommonTypeConstraints.td" -include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization.td" +include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" // Transpose conv supports hybrid computation with quantized weights. diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td index ae8af0a99cc8..f775781e2b52 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td @@ -21,13 +21,13 @@ include "mlir/IR/PatternBase.td" include "mlir/Dialect/Arith/IR/ArithOps.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/IR/CommonTypeConstraints.td" -include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization.td" +include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" // Quantize attribute $0 by using quantization parameter from %1. 
-def QuantizeByQuantizedType : NativeCodeCall<"quant::Quantize($0, $1.getValue())">; +def QuantizeByQuantizedType : NativeCodeCall<"TFL::Quantize($0, $1.getValue())">; def F32ElementsAttr : ElementsAttrBase< - CPred<"$_self.cast().getShapedType().getElementType().isF32()">, "float constant tensor">; + CPred<"llvm::cast($_self).getShapedType().getElementType().isF32()">, "float constant tensor">; def HasSameType : Constraint>; diff --git a/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/legalize_tensorlist_pass.cc b/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/legalize_tensorlist_pass.cc index 0fe96f4b0b71..5e20684f6a94 100644 --- a/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/legalize_tensorlist_pass.cc +++ b/tensorflow/compiler/mlir/lite/transforms/tf_legalizations/legalize_tensorlist_pass.cc @@ -71,7 +71,8 @@ ConstBytesAttr CreateListReserveOptions(MLIRContext* context, } std::optional GetSingularVariantBaseType(Value val) { - auto val_t = mlir::getElementTypeOrSelf(val).dyn_cast_or_null(); + auto val_t = llvm::dyn_cast_or_null( + mlir::getElementTypeOrSelf(val)); if (!val_t) { return std::nullopt; } @@ -107,11 +108,13 @@ std::optional CustomOptions(MLIRContext* context, bool HasVariantInputOrOutput(Operation* op) { const bool has_variant_input = llvm::any_of(op->getOperands(), [](Value val) { - return val.getType().cast().getElementType().isa(); + return llvm::isa( + llvm::cast(val.getType()).getElementType()); }); const bool has_variant_output = llvm::any_of(op->getResultTypes(), [](Type t) { - return t.cast().getElementType().isa(); + return llvm::isa( + llvm::cast(t).getElementType()); }); return has_variant_input || has_variant_output; } diff --git a/tensorflow/compiler/mlir/lite/transforms/tflite_passes/optimize_batch_matmul_utils.cc b/tensorflow/compiler/mlir/lite/transforms/tflite_passes/optimize_batch_matmul_utils.cc new file mode 100644 index 000000000000..e40fb1a85d4e --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/tflite_passes/optimize_batch_matmul_utils.cc @@ -0,0 +1,303 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/transforms/tflite_passes/optimize_batch_matmul_utils.h" + +#include +#include +#include +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +BatchMatMulDimensionsInfo::BatchMatMulDimensionsInfo(mlir::ShapedType type, + bool is_lhs) + : is_lhs_(is_lhs) { + // BatchMatMulOp has the following shape pattern: B0,...,Bn,L,C and + // B0,...,Bn,C,R. So, there is only one Contracting dimension and one + // output dimension. 
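+  // e.g. for an LHS of shape [B0, B1, L, C] = [2, 3, 4, 5]: batch axes {0, 1},
+  // out (L) axis 2 with size 4, contracting (C) axis 3 with size 5. For an RHS
+  // of the same rank, the last two axes swap roles.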
+ const int64_t rank = type.getRank(); + + if (is_lhs) { + contracting_dimensions_.axes.push_back(rank - 1); + contracting_dimensions_.sizes.push_back(type.getDimSize(rank - 1)); + out_dimensions_.axes.push_back(rank - 2); + out_dimensions_.sizes.push_back(type.getDimSize(rank - 2)); + } else { + contracting_dimensions_.axes.push_back(rank - 2); + contracting_dimensions_.sizes.push_back(type.getDimSize(rank - 2)); + out_dimensions_.axes.push_back(rank - 1); + out_dimensions_.sizes.push_back(type.getDimSize(rank - 1)); + } + // Dims 0 and 1 are contracting and output dimensions, hence skipped. + for (int64_t dim = 0; dim < rank - 2; ++dim) { + batch_dimensions_.axes.push_back(dim); + batch_dimensions_.sizes.push_back(type.getDimSize(dim)); + } +} + +const DimensionVector& BatchMatMulDimensionsInfo::batch_dimensions() const { + return batch_dimensions_; +} +const DimensionVector& BatchMatMulDimensionsInfo::contracting_dimensions() + const { + return contracting_dimensions_; +} + +const DimensionVector& BatchMatMulDimensionsInfo::out_dimensions() const { + return out_dimensions_; +} + +bool BatchMatMulDimensionsInfo::is_lhs() const { return is_lhs_; } + +BatchMatMulDimensionsInfo GetBatchMatMulLhsDimensionsInfo( + mlir::ShapedType type) { + return BatchMatMulDimensionsInfo(type, /*is_lhs=*/true); +} + +BatchMatMulDimensionsInfo GetBatchMatMulRhsDimensionsInfo( + mlir::ShapedType type) { + return BatchMatMulDimensionsInfo(type, /*is_lhs=*/false); +} + +bool HasFlattenedContractingDims( + llvm::ArrayRef reshape_input_shape, + const BatchMatMulDimensionsInfo& bmm_dimensions_info) { + // Batch dimensions are not flattened and need to match the LHS/RHS of + // BatchMatMulOp. + auto batch_dimensions = bmm_dimensions_info.batch_dimensions().SizesArray(); + // The batch dimensions are at the front of the input shape. + auto reshape_input_shape_batch_dims = + reshape_input_shape.take_front(batch_dimensions.size()); + + if (!llvm::all_of( + llvm::zip(batch_dimensions, reshape_input_shape_batch_dims), + [](auto dims) { return std::get<0>(dims) == std::get<1>(dims); })) { + return false; + } + + // Out dimensions are assumed to be unflattened and need to match the LHS/RHS + // of BatchMatMulOp. + auto out_dimensions = bmm_dimensions_info.out_dimensions().SizesArray(); + llvm::ArrayRef reshape_input_shape_out_dims; + // The out dimensions are at the end of the input shape for LHS and + // at the front for RHS. + if (bmm_dimensions_info.is_lhs()) { + reshape_input_shape_out_dims = + reshape_input_shape.slice(batch_dimensions.size(), 1); + } else { + reshape_input_shape_out_dims = + reshape_input_shape.take_back(out_dimensions.size()); + } + if (!llvm::all_of( + llvm::zip(out_dimensions, reshape_input_shape_out_dims), + [](auto dims) { return std::get<0>(dims) == std::get<1>(dims); })) { + return false; + } + + auto contracting_dimensions = + bmm_dimensions_info.contracting_dimensions().SizesArray(); + // The contracting dimensions are at the end of the input shape for + // LHS and at the front for RHS. 
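+  // e.g. for an LHS contracting size of 50, a reshape input shape of
+  // {1, 2, 3, 4, 5, 10} leaves {5, 10} as the candidate contracting dims, and
+  // 5 * 10 == 50, so the reshape flattens the contracting dimensions.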
+ llvm::ArrayRef reshape_input_shape_contracting_dims; + size_t num_contracting_dims = reshape_input_shape.size() - + batch_dimensions.size() - out_dimensions.size(); + if (bmm_dimensions_info.is_lhs()) { + reshape_input_shape_contracting_dims = + reshape_input_shape.take_back(num_contracting_dims); + } else { + reshape_input_shape_contracting_dims = reshape_input_shape.slice( + batch_dimensions.size(), num_contracting_dims); + } + + return (std::accumulate(reshape_input_shape_contracting_dims.begin(), + reshape_input_shape_contracting_dims.end(), 1, + std::multiplies()) == + contracting_dimensions[0]); +} + +bool HasFlattenedOutDims(llvm::ArrayRef reshape_input_shape, + const BatchMatMulDimensionsInfo& bmm_dimensions_info) { + // Batch dimensions are not flattened and need to match the LHS/RHS of + // BatchMatMulOp. + auto batch_dimensions = bmm_dimensions_info.batch_dimensions().SizesArray(); + // The batch dimensions are at the front of the input shape. + auto reshape_input_shape_batch_dims = + reshape_input_shape.take_front(batch_dimensions.size()); + if (!llvm::all_of( + llvm::zip(batch_dimensions, reshape_input_shape_batch_dims), + [](auto dims) { return std::get<0>(dims) == std::get<1>(dims); })) { + return false; + } + + auto contracting_dimensions = + bmm_dimensions_info.contracting_dimensions().SizesArray(); + // The contracting dimensions are at the end of the input shape for + // LHS and at the front for RHS. + llvm::ArrayRef reshape_input_shape_contracting_dims; + if (bmm_dimensions_info.is_lhs()) { + reshape_input_shape_contracting_dims = + reshape_input_shape.take_back(contracting_dimensions.size()); + } else { + reshape_input_shape_contracting_dims = + reshape_input_shape.slice(batch_dimensions.size(), 1); + } + if (!llvm::all_of( + llvm::zip(contracting_dimensions, + reshape_input_shape_contracting_dims), + [](auto dims) { return std::get<0>(dims) == std::get<1>(dims); })) { + return false; + } + + auto out_dimensions = bmm_dimensions_info.out_dimensions().SizesArray(); + // The out dimensions are at the end of the input shape for LHS and + // at the front for RHS. + llvm::ArrayRef reshape_input_shape_out_dims; + size_t num_out_dims = reshape_input_shape.size() - batch_dimensions.size() - + contracting_dimensions.size(); + if (bmm_dimensions_info.is_lhs()) { + reshape_input_shape_out_dims = + reshape_input_shape.slice(batch_dimensions.size(), num_out_dims); + } else { + reshape_input_shape_out_dims = reshape_input_shape.take_back(num_out_dims); + } + + return (std::accumulate(reshape_input_shape_out_dims.begin(), + reshape_input_shape_out_dims.end(), 1, + std::multiplies()) == out_dimensions[0]); +} + +std::tuple, std::pair> +GetTransposedGroupsIndexRange(llvm::ArrayRef transpose_permutation) { + // If the input vector is empty, return None for both pairs. + if (transpose_permutation.empty()) { + return {{-1, -1}, {-1, -1}}; // Use -1 to indicate None + } + + int group_one_end_idx = -1; + for (int i = 0; i < transpose_permutation.size(); ++i) { + if (transpose_permutation[i] == i) { + group_one_end_idx = i; + } else { + break; + } + } + + // If all dimensions are batch dimensions, i.e. the first group is a + // monotonically increasing sequence, return None for both remaining groups. 
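+  // (e.g. for a fully increasing permutation such as {0, 1, 2} both returned
+  // ranges are {-1, -1}.)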
+ if (group_one_end_idx == transpose_permutation.size() - 1) { + return {{-1, -1}, {-1, -1}}; + } + + int group_two_start_idx = group_one_end_idx + 1; + int group_two_end_idx = group_two_start_idx; + int group_three_start_idx = -1; + int group_three_end_idx = -1; + + int group_two_end_idx_value = transpose_permutation.size() - 1; + int group_three_start_idx_value = group_one_end_idx + 1; + + for (int i = group_two_start_idx + 1; i < transpose_permutation.size(); ++i) { + if (transpose_permutation[i] > group_two_end_idx_value || + transpose_permutation[i] <= group_three_start_idx_value || + (transpose_permutation[i] != transpose_permutation[i - 1] + 1)) { + break; + } + group_two_end_idx = i; + } + + group_three_start_idx = group_two_end_idx + 1; + group_three_end_idx = transpose_permutation.size() - 1; + // Fail if the last group is not a monotonically increasing sequence. + for (int i = group_three_start_idx + 1; i < transpose_permutation.size(); + ++i) { + if (transpose_permutation[i] != transpose_permutation[i - 1] + 1) { + return {{-1, -1}, {-1, -1}}; + } + } + + // Handle edge cases where start index might be greater than end index. + if (group_two_start_idx > group_two_end_idx) { + group_two_start_idx = group_two_end_idx; + } + + if (group_three_start_idx > group_three_end_idx) { + group_three_start_idx = group_three_end_idx; + } + if (group_three_start_idx >= transpose_permutation.size()) { + group_three_start_idx = -1; + group_three_end_idx = -1; + } + + return {{group_two_start_idx, group_two_end_idx}, + {group_three_start_idx, group_three_end_idx}}; +} + +bool HasTransposedContractingAndOutDims( + llvm::ArrayRef transpose_input_shape, + llvm::ArrayRef transpose_permutation, + const BatchMatMulDimensionsInfo& bmm_dimensions_info) { + std::tuple, std::pair> + transposed_groups_index_range = + GetTransposedGroupsIndexRange(transpose_permutation); + // Return false if the transpose_permutation is not valid. + if (std::get<0>(transposed_groups_index_range).first == -1 || + std::get<0>(transposed_groups_index_range).second == -1 || + std::get<1>(transposed_groups_index_range).first == -1 || + std::get<1>(transposed_groups_index_range).second == -1) { + return false; + } + + // Check if the broadcast dimensions match the batch dimensions of + // BatchMatMulOp. + if (!bmm_dimensions_info.batch_dimensions().AxesArray().empty() && + bmm_dimensions_info.batch_dimensions().AxesArray().back() != + std::get<0>(transposed_groups_index_range).first - 1) { + return false; + } + + // Accumulating the sizes of the transposed groups should match the sizes of + // the contracting and out dimensions of BatchMatMulOp. + int64_t group_two_dims_size = 1; + int64_t group_three_dims_size = 1; + for (int i = std::get<0>(transposed_groups_index_range).first; + i <= std::get<0>(transposed_groups_index_range).second; ++i) { + group_two_dims_size *= transpose_input_shape[transpose_permutation[i]]; + } + for (int i = std::get<1>(transposed_groups_index_range).first; + i <= std::get<1>(transposed_groups_index_range).second; ++i) { + group_three_dims_size *= transpose_input_shape[transpose_permutation[i]]; + } + + const auto& out_dims = bmm_dimensions_info.out_dimensions().SizesArray()[0]; + const auto& contracting_dims = + bmm_dimensions_info.contracting_dimensions().SizesArray()[0]; + + return bmm_dimensions_info.is_lhs() + ? 
(group_two_dims_size == out_dims && + group_three_dims_size == contracting_dims) + : (group_two_dims_size == contracting_dims && + group_three_dims_size == out_dims); +} +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/tflite_passes/optimize_batch_matmul_utils.h b/tensorflow/compiler/mlir/lite/transforms/tflite_passes/optimize_batch_matmul_utils.h new file mode 100644 index 000000000000..3eb3de702e1f --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/tflite_passes/optimize_batch_matmul_utils.h @@ -0,0 +1,141 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TFLITE_PASSES_OPTIMIZE_BATCH_MATMUL_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TFLITE_PASSES_OPTIMIZE_BATCH_MATMUL_UTILS_H_ + +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace TFL { + +// LHS and RHS of BatchMatMulOp has shapes following the pattern: +// B0,...,Bn,L,C and B0,...,Bn,C,R. The output shape of BatchMatMulOp is: +// B0,...,Bn,L,R. +// +// LHS and RHS of FullyConnectedOp has shapes following the pattern: +// B0,...,Bn,L,C and R,C. The output shape of FullyConnectedOp is: +// B0,...,Bn,L,R. +// +// The fundamental idea behind seeing transposes and reshapes around +// BatchMatMulOp is that- +// -- BatchMatMulOp is often created as a result of lowering einsum or +// dot_general ops. +// -- einsum and dot_general ops have multiple contracting and output +// dimensions that will to be reshaped and transposed to match the +// BatchMatMulOp's LHS and RHS restrictions. +// +// This file contains utility functions to identify the reshapes and transposes +// around BatchMatMulOp and see if they can be fused. + +// A struct to hold axes and sizes for a set of dimensions. +struct DimensionVector { + llvm::ArrayRef AxesArray() const { return axes; } + llvm::ArrayRef SizesArray() const { return sizes; } + + llvm::SmallVector axes; + llvm::SmallVector sizes; +}; + +// A struct to hold information about dimensions of dot_general operands. +class BatchMatMulDimensionsInfo { + public: + BatchMatMulDimensionsInfo(mlir::ShapedType type, bool is_lhs); + const DimensionVector& batch_dimensions() const; + const DimensionVector& contracting_dimensions() const; + // Out dimensions are any dimensions that are neither batch nor contracting + // dimensions, hence will be propagated to output shape. + const DimensionVector& out_dimensions() const; + bool is_lhs() const; + + private: + DimensionVector batch_dimensions_; + DimensionVector contracting_dimensions_; + // Out dimensions are any dimensions that are neither batch nor contracting + // dimensions, hence will be propagated to output shape. 
+ DimensionVector out_dimensions_; + bool is_lhs_; +}; + +// Returns the dimensions info of the LHS of BatchMatMulOp. +BatchMatMulDimensionsInfo GetBatchMatMulLhsDimensionsInfo( + mlir::ShapedType type); + +// Returns the dimensions info of the RHS of BatchMatMulOp. +BatchMatMulDimensionsInfo GetBatchMatMulRhsDimensionsInfo( + mlir::ShapedType type); + +// Returns true if the product of the last few dimensions in the +// `reshape_input_shape` is equal to the contracting dimension of the +// `bmm_dimensions_info`. +bool HasFlattenedContractingDims( + llvm::ArrayRef reshape_input_shape, + const BatchMatMulDimensionsInfo& bmm_dimensions_info); + +// Returns true if the product of the first few dimensions in the +// `reshape_input_shape` is equal to the output dimension of the +// `bmm_dimensions_info`. +bool HasFlattenedOutDims(llvm::ArrayRef reshape_input_shape, + const BatchMatMulDimensionsInfo& bmm_dimensions_info); + +// Returns true if the contracting and output dimensions are transposed in the +// `transpose_permutation`. +bool HasTransposedContractingAndOutDims( + llvm::ArrayRef transpose_input_shape, + llvm::ArrayRef transpose_permutation, + const BatchMatMulDimensionsInfo& bmm_dimensions_info); + +// `transpose_permutation` is the permutation of the input shape of the +// transpose op. `transpose_input_shape` is the shape of the input of the +// transpose op. `bmm_dimensions_info` is the dimensions info of the +// BatchMatMulOp. +// +// The dimensions in the transpose_permutation can be split into three groups: +// 1. Batch dimensions +// 2. Contracting dimensions +// 3. Output dimensions +// +// - The number of dimensions and the order of the dimensions in the +// batch-dimensions group is expected to match the batch dimensions of the +// BatchMatMulOp. +// - The number of dimensions in the contracting-dimensions and +// output-dimensions groups can be more than 1. +// - The dimensions in group 1 are expected to be a monotonically increasing +// sequence. +// - The dimensions in group 2 and 3 need not be a monotonically increasing +// sequence. +// - In this function, we only care if the groups 2 and 3 are transposed. +// +// For example, consider the following transpose_permutation- +// [0, 1, 2, 6, 7, 8, 3, 4, 5]. Here all the three groups are monotonically +// increasing. But other permutations like [0, 1, 2, 8, 7, 6, 4, 5, 3] and [0, +// 1, 2, 6, 7, 8, 3, 5, 4] are also valid. +// +// NOTE: The first version of this function will support the case where all the +// three groups are monotonically increasing. +std::tuple, std::pair> +GetTransposedGroupsIndexRange(llvm::ArrayRef transpose_permutation); + +} // namespace TFL +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_TRANSFORMS_TFLITE_PASSES_OPTIMIZE_BATCH_MATMUL_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/transforms/tflite_passes/optimize_batch_matmul_utils_test.cc b/tensorflow/compiler/mlir/lite/transforms/tflite_passes/optimize_batch_matmul_utils_test.cc new file mode 100644 index 000000000000..cf026d8c8169 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/tflite_passes/optimize_batch_matmul_utils_test.cc @@ -0,0 +1,168 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/transforms/tflite_passes/optimize_batch_matmul_utils.h" + +#include +#include +#include + +#include +#include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project + +namespace mlir { +namespace TFL { +namespace { + +TEST(OptimizeBatchMatmulUtilsTest, BatchMatMulDimensionsInfo) { + mlir::MLIRContext context; + mlir::ShapedType type = mlir::RankedTensorType::get( + {1, 2, 3, 4, 5}, mlir::Float32Type::get(&context)); + BatchMatMulDimensionsInfo lhs_info(type, /*is_lhs=*/true); + EXPECT_EQ(lhs_info.batch_dimensions().AxesArray(), + llvm::ArrayRef({0, 1, 2})); + EXPECT_EQ(lhs_info.batch_dimensions().SizesArray(), + llvm::ArrayRef({1, 2, 3})); + EXPECT_EQ(lhs_info.contracting_dimensions().AxesArray(), + llvm::ArrayRef({4})); + EXPECT_EQ(lhs_info.contracting_dimensions().SizesArray(), + llvm::ArrayRef({5})); + EXPECT_EQ(lhs_info.out_dimensions().AxesArray(), + llvm::ArrayRef({3})); + EXPECT_EQ(lhs_info.out_dimensions().SizesArray(), + llvm::ArrayRef({4})); + EXPECT_TRUE(lhs_info.is_lhs()); + + BatchMatMulDimensionsInfo rhs_info(type, /*is_lhs=*/false); + EXPECT_EQ(rhs_info.batch_dimensions().AxesArray(), + llvm::ArrayRef({0, 1, 2})); + EXPECT_EQ(rhs_info.batch_dimensions().SizesArray(), + llvm::ArrayRef({1, 2, 3})); + EXPECT_EQ(rhs_info.contracting_dimensions().AxesArray(), + llvm::ArrayRef({3})); + EXPECT_EQ(rhs_info.contracting_dimensions().SizesArray(), + llvm::ArrayRef({4})); + EXPECT_EQ(rhs_info.out_dimensions().AxesArray(), + llvm::ArrayRef({4})); + EXPECT_EQ(rhs_info.out_dimensions().SizesArray(), + llvm::ArrayRef({5})); + EXPECT_FALSE(rhs_info.is_lhs()); +} + +TEST(OptimizeBatchMatmulUtilsTest, HasFlattenedContractingDims) { + mlir::MLIRContext context; + mlir::ShapedType type = mlir::RankedTensorType::get( + {1, 2, 3, 4, 50}, mlir::Float32Type::get(&context)); + BatchMatMulDimensionsInfo lhs_info(type, /*is_lhs=*/true); + EXPECT_TRUE(HasFlattenedContractingDims({1, 2, 3, 4, 5, 10}, lhs_info)); + EXPECT_FALSE(HasFlattenedContractingDims({1, 2, 3, 4, 10}, lhs_info)); + + type = mlir::RankedTensorType::get({1, 2, 12, 5}, + mlir::Float32Type::get(&context)); + BatchMatMulDimensionsInfo rhs_info(type, /*is_lhs=*/false); + EXPECT_TRUE(HasFlattenedContractingDims({1, 2, 3, 4, 5}, rhs_info)); + EXPECT_FALSE(HasFlattenedContractingDims({1, 2, 3, 4, 10}, rhs_info)); + + type = mlir::RankedTensorType::get({4, 50}, mlir::Float32Type::get(&context)); + lhs_info = BatchMatMulDimensionsInfo(type, /*is_lhs=*/true); + EXPECT_TRUE(HasFlattenedContractingDims({4, 5, 10}, lhs_info)); + EXPECT_FALSE(HasFlattenedContractingDims({4, 10}, lhs_info)); + + type = mlir::RankedTensorType::get({12, 5}, mlir::Float32Type::get(&context)); + rhs_info = BatchMatMulDimensionsInfo(type, /*is_lhs=*/false); + EXPECT_TRUE(HasFlattenedContractingDims({3, 4, 5}, rhs_info)); + EXPECT_FALSE(HasFlattenedContractingDims({3, 4, 10}, rhs_info)); +} + 
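The tests above exercise the shape bookkeeping for flattened contracting dimensions. As a minimal standalone sketch of that property, inferred from the test expectations rather than taken from the patch's implementation (TrailingDimsFlattenTo is a hypothetical helper), the trailing dimensions of the reshape input must multiply out to the operand's contracting dimension size:

#include <cstdint>
#include <vector>

// Returns true when the product of the last `num_trailing` dims of
// `reshape_input` equals `contracting_size`, e.g. {..., 5, 10} -> 50.
inline bool TrailingDimsFlattenTo(const std::vector<int64_t>& reshape_input,
                                  int64_t contracting_size, int num_trailing) {
  if (num_trailing > static_cast<int>(reshape_input.size())) return false;
  int64_t product = 1;
  for (int i = 0; i < num_trailing; ++i) {
    product *= reshape_input[reshape_input.size() - 1 - i];
  }
  return product == contracting_size;
}

// Mirrors the expectations above for an LHS of type tensor<1x2x3x4x50xf32>:
// TrailingDimsFlattenTo({1, 2, 3, 4, 5, 10}, 50, 2) -> true  (5 * 10 == 50)
// TrailingDimsFlattenTo({1, 2, 3, 4, 10}, 50, 1)    -> false (10 != 50)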
+TEST(OptimizeBatchMatmulUtilsTest, HasFlattenedOutDims) { + mlir::MLIRContext context; + mlir::ShapedType type = mlir::RankedTensorType::get( + {1, 2, 12, 5}, mlir::Float32Type::get(&context)); + BatchMatMulDimensionsInfo lhs_info(type, /*is_lhs=*/true); + EXPECT_TRUE(HasFlattenedOutDims({1, 2, 3, 4, 5}, lhs_info)); + EXPECT_FALSE(HasFlattenedOutDims({1, 2, 3, 4, 10}, lhs_info)); + + type = mlir::RankedTensorType::get({1, 2, 12, 10}, + mlir::Float32Type::get(&context)); + BatchMatMulDimensionsInfo rhs_info(type, /*is_lhs=*/false); + EXPECT_TRUE(HasFlattenedOutDims({1, 2, 12, 5, 2}, rhs_info)); + EXPECT_FALSE(HasFlattenedOutDims({1, 2, 3, 4, 10}, rhs_info)); + + type = mlir::RankedTensorType::get({12, 5}, mlir::Float32Type::get(&context)); + lhs_info = BatchMatMulDimensionsInfo(type, /*is_lhs=*/true); + EXPECT_TRUE(HasFlattenedOutDims({3, 4, 5}, lhs_info)); + EXPECT_FALSE(HasFlattenedOutDims({3, 4, 10}, lhs_info)); + + type = + mlir::RankedTensorType::get({12, 10}, mlir::Float32Type::get(&context)); + rhs_info = BatchMatMulDimensionsInfo(type, /*is_lhs=*/false); + EXPECT_TRUE(HasFlattenedOutDims({12, 5, 2}, rhs_info)); + EXPECT_FALSE(HasFlattenedOutDims({3, 4, 10}, rhs_info)); +} + +TEST(OptimizeBatchMatmulUtilsTest, GetTransposedGroupsIndexRange) { + EXPECT_EQ(GetTransposedGroupsIndexRange({0, 1, 2, 6, 7, 8, 3, 4, 5}), + std::make_tuple(std::make_pair(3, 5), std::make_pair(6, 8))); + EXPECT_EQ(GetTransposedGroupsIndexRange({2, 0, 1}), + std::make_tuple(std::make_pair(0, 0), std::make_pair(1, 2))); + EXPECT_EQ(GetTransposedGroupsIndexRange({0, 1, 2, 3, 7, 8, 4, 5, 6}), + std::make_tuple(std::make_pair(4, 5), std::make_pair(6, 8))); + EXPECT_EQ(GetTransposedGroupsIndexRange({0, 1, 2, 3, 8, 7, 4, 5, 6}), + std::make_tuple(std::make_pair(-1, -1), std::make_pair(-1, -1))); + EXPECT_EQ(GetTransposedGroupsIndexRange({0, 1, 2}), + std::make_tuple(std::make_pair(-1, -1), std::make_pair(-1, -1))); + EXPECT_EQ(GetTransposedGroupsIndexRange({0, 1, 2}), + std::make_tuple(std::make_pair(-1, -1), std::make_pair(-1, -1))); + EXPECT_EQ(GetTransposedGroupsIndexRange({}), + std::make_tuple(std::make_pair(-1, -1), std::make_pair(-1, -1))); +} + +TEST(OptimizeBatchMatmulUtilsTest, HasTransposedContractingAndOutDims) { + mlir::MLIRContext context; + mlir::ShapedType type = mlir::RankedTensorType::get( + {1, 2, 3, 504, 120}, mlir::Float32Type::get(&context)); + BatchMatMulDimensionsInfo lhs_info(type, /*is_lhs=*/true); + EXPECT_TRUE(HasTransposedContractingAndOutDims( + {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 1, 2, 6, 7, 8, 3, 4, 5}, lhs_info)); + EXPECT_FALSE(HasTransposedContractingAndOutDims( + {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 1, 2, 8, 7, 6, 4, 5, 3}, lhs_info)); + + BatchMatMulDimensionsInfo rhs_info(type, /*is_lhs=*/false); + EXPECT_TRUE(HasTransposedContractingAndOutDims( + {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 1, 2, 6, 7, 8, 3, 4, 5}, rhs_info)); + EXPECT_FALSE(HasTransposedContractingAndOutDims( + {1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 1, 2, 8, 7, 6, 4, 5, 3}, rhs_info)); + + type = + mlir::RankedTensorType::get({504, 120}, mlir::Float32Type::get(&context)); + lhs_info = BatchMatMulDimensionsInfo(type, /*is_lhs=*/true); + EXPECT_TRUE(HasTransposedContractingAndOutDims({4, 5, 6, 7, 8, 9}, + {3, 4, 5, 0, 1, 2}, lhs_info)); + EXPECT_FALSE(HasTransposedContractingAndOutDims( + {4, 5, 6, 7, 8, 9}, {5, 4, 3, 1, 2, 0}, lhs_info)); + + rhs_info = BatchMatMulDimensionsInfo(type, /*is_lhs=*/false); + EXPECT_TRUE(HasTransposedContractingAndOutDims({4, 5, 6, 7, 8, 9}, + {3, 4, 5, 0, 1, 2}, rhs_info)); + 
EXPECT_FALSE(HasTransposedContractingAndOutDims( + {4, 5, 6, 7, 8, 9}, {5, 4, 3, 1, 2, 0}, rhs_info)); +} + +} // namespace +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.cc b/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.cc index 069bf7fd6636..2b0355712165 100644 --- a/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.cc +++ b/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.cc @@ -15,8 +15,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.h" #include -#include -#include #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/lite/transforms/unfreeze_global_constants.cc b/tensorflow/compiler/mlir/lite/transforms/unfreeze_global_constants.cc index 63c74dcc7dea..6b465500684b 100644 --- a/tensorflow/compiler/mlir/lite/transforms/unfreeze_global_constants.cc +++ b/tensorflow/compiler/mlir/lite/transforms/unfreeze_global_constants.cc @@ -257,7 +257,9 @@ void UnfreezeMutableGlobalTensorsPass::runOnOperation() { arg.replaceAllUsesWith(var_handle_op->getResults()[0]); } - func.eraseArguments(args_to_erase); + if (failed(func.eraseArguments(args_to_erase))) { + return signalPassFailure(); + } } // Erase the mutable GlobalTensorOps that are replaced by VarHandleOps. diff --git a/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h b/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h index 146cae1f2c47..4e0fb068c8b9 100644 --- a/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h +++ b/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h @@ -32,8 +32,8 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/utils/utils.h" -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" namespace mlir { @@ -138,7 +138,7 @@ class InsertTFLQuantOpsAfterTFFakeQuantOp { IntegerAttr num_bits = rewriter.getI64IntegerAttr(tf_op.getNumBits()); BoolAttr narrow_range = rewriter.getBoolAttr(tf_op.getNarrowRange()); Type res_type = tf_op.getType(); - TypeAttr qtype = quant::GetQuantizedTypeAttr( + TypeAttr qtype = GetQuantizedTypeAttr( rewriter, res_type, min_value, max_value, quant_dim, num_bits, narrow_range, /*is_signed=*/false, /*legacy_float_scale=*/false, use_fake_quant_num_bits_); diff --git a/tensorflow/compiler/mlir/lite/utils/utils.h b/tensorflow/compiler/mlir/lite/utils/utils.h index 53f6a038678d..88088b5799e7 100644 --- a/tensorflow/compiler/mlir/lite/utils/utils.h +++ b/tensorflow/compiler/mlir/lite/utils/utils.h @@ -20,14 +20,20 @@ limitations under the License. 
#include #include #include +#include #include #include +#include "absl/status/status.h" +#include "absl/status/statusor.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "mlir/Dialect/Traits.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project @@ -58,13 +64,28 @@ inline bool IsPosInfiniteValue(APFloat value) { return value.isInfinity(); } +// Returns 1D 32-bit dense elements attribute with the given values. +inline DenseIntElementsAttr GetI32ElementsAttr(ArrayRef<int32_t> values, + Builder* builder) { + RankedTensorType ty = mlir::RankedTensorType::get( + {static_cast<int64_t>(values.size())}, builder->getIntegerType(32)); + return DenseIntElementsAttr::get(ty, values); +} + +inline DenseIntElementsAttr GetI64ElementsAttr(ArrayRef<int64_t> values, + Builder* builder) { + RankedTensorType ty = RankedTensorType::get( + {static_cast<int64_t>(values.size())}, builder->getIntegerType(64)); + return DenseIntElementsAttr::get(ty, values); +} + // Returns true if all tensor value in `values` has static shape and same shape. inline bool OpHasSameStaticShapes(Operation* op) { auto values = op->getOperands(); int operand_num = 0; ArrayRef<int64_t> shape; for (Value value : values) { - auto shaped_type = value.getType().dyn_cast<ShapedType>(); + auto shaped_type = mlir::dyn_cast<ShapedType>(value.getType()); if (!shaped_type || !shaped_type.hasStaticShape()) { return false; } @@ -117,6 +138,19 @@ inline DenseElementsAttr RemapPermutation(Value permutation1, return RemapPermutation(permutation1, perm2_const); } +inline bool IsTransposeNoop(Value permutation) { + DenseElementsAttr perm_values_attr; + if (!matchPattern(permutation, m_Constant(&perm_values_attr))) return false; + + for (const auto& [idx, perm_value] : + llvm::enumerate(perm_values_attr.getValues<APInt>())) { + if (perm_value.getSExtValue() != idx) { + return false; + } + } + return true; +} + // Returns true if the transpose op is trivial. Trivial means that // the permutation is a cyclic permutation of the original shape with only the // identity dimensions permuted. @@ -151,7 +185,7 @@ inline bool IsTransposeTrivial(llvm::ArrayRef<int64_t> input_shape, // Returns the permutation that maps the input shape to the output shape. // This is only valid for trivial reshape ops. inline DenseElementsAttr GetPermutationFromTrivialReshape( - ShapedType input_type, ShapedType output_type) { + mlir::ShapedType input_type, mlir::ShapedType output_type) { ArrayRef<int64_t> in_shape = input_type.getShape(); ArrayRef<int64_t> out_shape = output_type.getShape(); @@ -195,8 +229,8 @@ inline DenseElementsAttr GetPermutationFromTrivialReshape( // Returns true if the reshape op is equivalent to a transpose op. // This is true if the reshape op is a trivial reshape op, meaning no change in // the order of non-identity dimensions.
-inline bool IsReshapeEquivalentToTranspose(ShapedType input_type, - ShapedType output_type) { +inline bool IsReshapeEquivalentToTranspose(mlir::ShapedType input_type, + mlir::ShapedType output_type) { std::vector in_shape{input_type.getShape().vec()}; std::vector out_shape{output_type.getShape().vec()}; @@ -215,14 +249,14 @@ inline bool IsReshapeEquivalentToTranspose(ShapedType input_type, // Checks if all elements in the constant attribute value are 1. inline bool IsAllOnesConstant(Attribute value) { - auto values = value.cast().getValues(); + auto values = mlir::cast(value).getValues(); return !std::any_of(values.begin(), values.end(), [](int32_t element_value) { return element_value != 1; }); } // Checks if all elements in the constant attribute value are non-negative. inline bool HasNonNegativeValues(Attribute value) { - auto values = value.cast().getValues(); + auto values = mlir::cast(value).getValues(); return !std::any_of( values.begin(), values.end(), [](const APInt& element_value) { return element_value.isNegative(); }); @@ -230,8 +264,8 @@ inline bool HasNonNegativeValues(Attribute value) { // Utility function to get the offset between two dense attribute values. inline TypedAttr GetOffSet(Attribute begin, Attribute end) { - auto begin_values = begin.cast().getValues(); - auto end_values = end.cast().getValues(); + auto begin_values = mlir::cast(begin).getValues(); + auto end_values = mlir::cast(end).getValues(); SmallVector offsets; if (begin_values.size() == end_values.size()) { @@ -269,7 +303,7 @@ inline bool AreLastTwoDimsTransposed(Value permutation) { // Gets the new type after transposing the last 2 dimensions. inline Type TransposeLastTwoDims(Type type) { - auto shaped_type = type.dyn_cast(); + auto shaped_type = mlir::dyn_cast(type); if (!shaped_type.hasStaticShape() || shaped_type.getRank() < 2) { return nullptr; } @@ -285,9 +319,9 @@ inline Type TransposeLastTwoDims(Type type) { // Returns a ShapedType for a permutation and the shape of input after // applying the permutation to the given shape through a transpose. -inline ShapedType GetTransposedType(Value input, - llvm::ArrayRef permutation_array) { - auto input_type = input.getType().cast(); +inline mlir::ShapedType GetTransposedType( + Value input, llvm::ArrayRef permutation_array) { + auto input_type = mlir::cast(input.getType()); if (permutation_array.size() != input_type.getRank()) { return nullptr; } @@ -327,41 +361,67 @@ inline DenseElementsAttr GetExpandedShapeAttr(Value input_val, int n) { // Return the resultant shape type if the shape of the supplied attribute/value // is expanded by n leading 1s'. -inline ShapedType GetExpandedShapeType(Value input_val, int n) { +inline mlir::ShapedType GetExpandedShapeType(Value input_val, int n) { auto expanded_shape = GetExpandedShape(input_val, n); return RankedTensorType::get( SmallVector{expanded_shape.begin(), expanded_shape.end()}, mlir::cast(input_val.getType()).getElementType()); } -// Returns shape of a ranked tensor. -// Precondition: output_val's is ranked tensor. -// Returns a truncated shape when `truncate` is set to true. -inline DenseElementsAttr GetShape(Value output_val, bool truncate = false) { - auto output_shape = output_val.getType().dyn_cast().getShape(); +// Returns shape of a ranked tensor as a SmallVector. +// Precondition: input_value's is ranked tensor. +// Returns a squeezed shape when `squeeze_leading_ones` is set to true. 
+inline SmallVector GetShape(Value input_value, + bool squeeze_leading_ones = false) { + auto output_shape = + mlir::dyn_cast(input_value.getType()).getShape(); SmallVector shape; shape.reserve(output_shape.size()); - bool needs_truncation = true; + bool can_squeeze = true; for (size_t dim_idx = 0; dim_idx < output_shape.size(); ++dim_idx) { int64_t dim = output_shape[dim_idx]; - if (truncate && needs_truncation && dim == 1) { + if (squeeze_leading_ones && can_squeeze && dim == 1) { continue; - } else if (needs_truncation && dim != 1) { - needs_truncation = false; + } else if (can_squeeze && dim != 1) { + can_squeeze = false; } shape.push_back(ShapedType::isDynamic(dim) ? -1 : static_cast(dim)); } + return shape; +} + +// Returns shape of a ranked tensor as a DenseElementsAttr. +// Precondition: input_value's is ranked tensor. +// Returns a squeezed shape when `squeeze_leading_ones` is set to true. +inline DenseElementsAttr GetShapeAttr(Value input_value, + bool squeeze_leading_ones = false) { + SmallVector shape = GetShape(input_value, squeeze_leading_ones); return mlir::DenseElementsAttr::get( RankedTensorType::get( {static_cast(shape.size())}, - mlir::IntegerType::get(output_val.getContext(), 32)), + mlir::IntegerType::get(input_value.getContext(), 32)), llvm::ArrayRef(shape)); } +// Returns the value of a constant attribute as an int array, if the value is +// not a constant, returns an error status. +inline absl::StatusOr> GetValueAsIntArray(Value value) { + DenseElementsAttr values_const_attr; + if (!matchPattern(value, m_Constant(&values_const_attr))) { + return absl::InvalidArgumentError("Value is not a constant."); + } + + SmallVector values; + for (const auto& value : values_const_attr.getValues()) { + values.push_back(value.getSExtValue()); + } + return values; +} + //////////////////////////////////////////////////////////////////////////////// ///////////////// OP BROADCASTING UTILITIES //////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// @@ -402,6 +462,136 @@ DenseElementsAttr GetScalarOfType(Type ty, T raw_value) { llvm_unreachable("unsupported type"); } +// Checks if reduction axes and broadcast axes are disjoint. +// Broadcast axes are derived by comparing the shape of `input_val` to the shape +// represented by `target_shape_attr` according to standard broadcasting rules. +// Returns true if the sets of axes are disjoint, false otherwise or on error. +inline bool AreBroadcastAndReductionAxesIndependent( + mlir::Value input_val, const mlir::Attribute& indices_attr, + const mlir::Attribute& target_shape_attr) { + // 1. Get input type and shape. + // Use llvm::dyn_cast for safer casting. + auto ranked_input_type = + llvm::dyn_cast(input_val.getType()); + if (!ranked_input_type) { + // Consider logging or error emission if builder context is + // available/needed. + return false; // Expect ranked type. + } + llvm::ArrayRef input_shape = ranked_input_type.getShape(); + const int64_t input_rank = ranked_input_type.getRank(); + + // 2. Validate and extract reduction axes. + // Use llvm::dyn_cast for safer casting. + auto indices = llvm::dyn_cast(indices_attr); + if (!indices || !indices.getElementType().isIntOrIndex()) { + return false; // Invalid indices attribute. + } + + // Use std::set for efficient storage and lookup of axes. + std::set reduction_axes_set; + if (!indices.empty()) { // Only process if there are reduction axes. 
+ if (input_rank == 0) { + // It's invalid to specify reduction axes for a scalar (rank 0) input. + return false; + } + + // Iterate using range-based for loop and structured binding (if applicable) + // or direct value access. + for (const mlir::APInt& axis_val : indices.getValues()) { + int64_t axis = + axis_val.getSExtValue(); // Use sign extension for neg axes. + + // Normalize axis and check bounds. + if (axis < -input_rank || axis >= input_rank) { + return false; // Axis out of bounds. + } + if (axis < 0) { + axis += input_rank; // Convert negative axis to positive. + } + reduction_axes_set.insert(axis); + } + } + + // If there are no reduction axes, they are trivially independent of any + // broadcast axes. + if (reduction_axes_set.empty()) { + return true; + } + + // 3. Validate and extract target shape for broadcast. + // Use llvm::dyn_cast for safer casting. + auto target_shape_value_attr = + llvm::dyn_cast(target_shape_attr); + if (!target_shape_value_attr || + !target_shape_value_attr.getElementType().isIntOrIndex()) { + return false; // Invalid target shape attribute. + } + + // Use llvm::SmallVector for efficient shape storage. + llvm::SmallVector target_shape_vec; + target_shape_vec.reserve( + target_shape_value_attr.getNumElements()); // Pre-allocate + for (const mlir::APInt& shape_val : + target_shape_value_attr.getValues()) { + // Assuming shape dimensions should be non-negative, consider getZExtValue. + // However, getSExtValue is safe if intermediate calculations handle signs. + target_shape_vec.push_back(shape_val.getSExtValue()); + } + // Use llvm::ArrayRef for safe, non-owning view of the shape vector. + llvm::ArrayRef target_shape = target_shape_vec; + const int64_t target_rank = target_shape.size(); + + // 4. Determine broadcast axes based on standard broadcasting rules. + std::set broadcast_axes_set; + const int64_t max_rank = std::max(input_rank, target_rank); + + // Iterate through dimensions, aligning from the right (trailing dimensions). + for (int64_t i = 0; i < max_rank; ++i) { + // Calculate indices relative to the end of the shape arrays. + const int64_t input_dim_idx = input_rank - 1 - i; + const int64_t target_dim_idx = target_rank - 1 - i; + + // Treat dimensions missing due to lower rank as having size 1. + const int64_t input_dim = + (input_dim_idx >= 0) ? input_shape[input_dim_idx] : 1; + const int64_t target_dim = + (target_dim_idx >= 0) ? target_shape[target_dim_idx] : 1; + + // Check for incompatible shapes (dimensions differ and neither is 1). + // This indicates an invalid broadcast according to NumPy rules. + if (input_dim != target_dim && input_dim != 1 && target_dim != 1) { + // Consider if the specific broadcast op allows other behaviors (e.g., + // -1). For standard rules, this is an incompatibility. + return false; + } + + // An axis in the *input* tensor is involved in broadcasting if its size is + // 1 and the corresponding target dimension size is greater than 1. + if (input_dim == 1 && target_dim > 1) { + // Ensure the axis index is valid for the input tensor's rank. + if (input_dim_idx >= 0) { + broadcast_axes_set.insert(input_dim_idx); + } + // Note: If input_dim_idx < 0, broadcasting occurs due to rank difference, + // but it doesn't correspond to an axis *within* the original input + // tensor. + } + } + + // 5. Check for intersection between the set of reduction axes and the set of + // broadcast axes derived above. 
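// Illustrative walkthrough, mirroring the unit tests added in
// tensorflow/compiler/mlir/lite/utils/utils_test.cc in this change: for an
// input of shape {2, 1, 4, 1} and a target shape of {2, 3, 4, 5}, the
// right-aligned comparison marks input axes 1 and 3 as broadcast axes (size 1
// expanding to 3 and 5). Reduction axes {0, 2} do not intersect {1, 3}, so the
// function returns true. For an input of shape {1, 3, 4, 5} the broadcast axis
// is 0, which intersects reduction axes {0, 2}, so the function returns false.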
+ for (int64_t reduction_axis : reduction_axes_set) { + if (broadcast_axes_set.count(reduction_axis)) { + // Found an axis that is present in both sets. + return false; + } + } + + // 6. No overlapping axes were found. + return true; +} + } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/utils/utils.td b/tensorflow/compiler/mlir/lite/utils/utils.td index 12d12a6c02fc..7583d48618f4 100644 --- a/tensorflow/compiler/mlir/lite/utils/utils.td +++ b/tensorflow/compiler/mlir/lite/utils/utils.td @@ -19,6 +19,18 @@ include "mlir/IR/OpBase.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/IR/PatternBase.td" + +//////////////////////////////////////////////////////////////////////////////// +///////////////// TENSOR TYPE UTILITIES //////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +def IsQuantized : Constraint($0.getType()) && " + "llvm::isa(" + "llvm::dyn_cast($0.getType()).getElementType())">>; + +def IsNotQuantized : Constraint>; + //////////////////////////////////////////////////////////////////////////////// ///////////////// TENSOR RANK UTILITIES //////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// @@ -26,42 +38,42 @@ include "mlir/IR/PatternBase.td" // Checks if the rank of the value is less than or equal to the rank of the // other value. def IsRankLessThanEqualTo : Constraint().getRank() <= " - "$1.getType().cast().getRank()">>; + "llvm::cast($0.getType()).getRank() <= " + "llvm::cast($1.getType()).getRank()">>; // Checks if the value has rank at most 'n'. class HasRankAtMost : Constraint< - CPred<"$0.getType().cast().hasRank() && " - "$0.getType().cast().getRank() <= " # n>>; + CPred<"llvm::cast($0.getType()).hasRank() && " + "llvm::cast($0.getType()).getRank() <= " # n>>; //////////////////////////////////////////////////////////////////////////////// ///////////////// DENSE UTILITIES ///////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// -def DenseFPElementsAttrPred : CPred<"$_self.isa()">; -def DenseIntElementsAttrPred : CPred<"$_self.isa()">; +def DenseFPElementsAttrPred : CPred<"llvm::isa($_self)">; +def DenseIntElementsAttrPred : CPred<"llvm::isa($_self)">; //////////////////////////////////////////////////////////////////////////////// ///////////////// SPLAT CONSTANT UTILITIES ///////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// def DenseElementsAttrIsSplatPred - : CPred<"$_self.cast().isSplat()">; + : CPred<"llvm::cast($_self).isSplat()">; class DenseFPElementsAttrSplatValueEqualToPred - : CPred<"$_self.cast().getSplatValue()" + : CPred<"llvm::cast($_self).getSplatValue()" ".getValueAsDouble() == " # val>; class DenseFPElementsAttrSplatValueEqualToPredWithTolerance - : CPred<"std::abs($_self.cast().getSplatValue()" + : CPred<"std::abs(llvm::cast($_self).getSplatValue()" ".getValueAsDouble() - " # val # ") <= "#tolerance>; class DenseIntElementsAttrSplatValueEqualToPred - : CPred<"$_self.isa() && " - "$_self.cast().getElementType()" - " .isa() && " - "$_self.cast().isSplat() && " - "$_self.cast().getSplatValue()" + : CPred<"llvm::isa($_self) && " + "llvm::isa(" + "llvm::cast($_self).getElementType()) && " + "llvm::cast($_self).isSplat() && " + "llvm::cast($_self).getSplatValue()" " .getValue().getSExtValue() == " # val>; // AttrConstraint to match a floating point 
dense elements attribute with a @@ -98,8 +110,8 @@ def SplatIntElementsAttr : ElementsAttrBase< def GetScalarElementsAttrFromSplat : NativeCodeCall< "DenseElementsAttr::get(" " RankedTensorType::get({}," - " $0.cast().getType().getElementType())," - " $0.cast().getSplatValue())">; + " llvm::cast($0).getType().getElementType())," + " llvm::cast($0).getSplatValue())">; //////////////////////////////////////////////////////////////////////////////// ///////////////// OP BROADCASTING UTILITIES //////////////////////////////////// @@ -109,15 +121,18 @@ def OperandsBroadcastToOutputType : Constraint>; +def OperandsDontBroadcastToOutputType : Constraint>; + //////////////////////////////////////////////////////////////////////////////// ///////////////// TENSOR SHAPE UTILITIES /////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// def HasSameStaticShapes : Constraint< - CPred<"$0.getType().cast().hasStaticShape() && " - "$1.getType().cast().hasStaticShape() && " - "$0.getType().cast().getShape() ==" - "$1.getType().cast().getShape()">, + CPred<"llvm::cast($0.getType()).hasStaticShape() && " + "llvm::cast($1.getType()).hasStaticShape() && " + "llvm::cast($0.getType()).getShape() ==" + "llvm::cast($1.getType()).getShape()">, "have the same static shape">; def CreateNoneValue : NativeCodeCall< @@ -125,7 +140,7 @@ def CreateNoneValue : NativeCodeCall< // Returns shape of a ranked tensor. // if called without a ranked tensor it will fail. -def GetShape: NativeCodeCall<"GetShape($0)">; +def GetShapeAttr: NativeCodeCall<"GetShapeAttr($0)">; // Return the resultant shape if the shape of the supplied attribute/value is // expanded by n leading 1s'. @@ -144,22 +159,25 @@ def IsAllOnesConstant : Constraint>; // the permutation is a cyclic permutation of the original shape with only the // identity dimensions permuted. def IsTransposeTrivial : Constraint().getShape(), $1)">>; + "TFL::IsTransposeTrivial(llvm::cast($0.getType()).getShape(), $1)">>; + +// Constraint that checks if the transpose op is a no-op. +def IsTransposeNoop : Constraint>; // Constraint that checks if the reshape op is equivalent to a transpose op. // This is true if the reshape op is a trivial reshape op, meaning no change in // the order of non-identity dimensions. def IsReshapeEquivalentToTranspose : Constraint()," - "$1.getType().cast())">>; + "llvm::cast($0.getType())," + "llvm::cast($1.getType()))">>; // Returns the permutation of the trivial reshape op, this will be used to // construct the transpose op. def GetPermutationFromTrivialReshape : NativeCodeCall< "TFL::GetPermutationFromTrivialReshape(" - "$0.getType().cast()," - "$1.getType().cast())">; + "llvm::cast($0.getType())," + "llvm::cast($1.getType()))">; // Constraint that checks if all values in offset between two // attributes are non-negative. @@ -173,12 +191,12 @@ def GetOffSet : NativeCodeCall<"TFL::GetOffSet($0, $1)">; // Attribute Constraint that checks if the attribute value is zero. def ZeroIntAttr - : AttrConstraint().getInt() == 0">>; + : AttrConstraint($_self).getInt() == 0">>; // Checks if the value has rank at most 'n'. class HasRankAtLeast : Constraint< - CPred<"$0.getType().cast().hasRank() && " - "$0.getType().cast().getRank() >= " # n>>; + CPred<"llvm::cast($0.getType()).hasRank() && " + "llvm::cast($0.getType()).getRank() >= " # n>>; // Accepts two inputs and check if both have the same element type. 
def SameElementType : Constraint< @@ -209,7 +227,7 @@ def AreLastTwoDimsTransposed : Constraint>; // Checks if the param passed is of NoneType. -def IsNoneType : Constraint()">>; +def IsNoneType : Constraint($0.getType())">>; def ConstantLikePred : CPred<"::mlir::matchPattern($0, ::mlir::m_Constant())">; def IsConstantLike : Constraint; diff --git a/tensorflow/compiler/mlir/lite/utils/utils_test.cc b/tensorflow/compiler/mlir/lite/utils/utils_test.cc new file mode 100644 index 000000000000..f4e37480b2b0 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/utils_test.cc @@ -0,0 +1,128 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/utils/utils.h" + +#include + +#include +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace TFL { +namespace { + +// Test fixture for AreBroadcastAndReductionAxesIndependent function. +class BroadcastAndReductionAxesIndependentTest : public ::testing::Test { + protected: + BroadcastAndReductionAxesIndependentTest() : builder_(&context_) { + context_.loadDialect(); + } + + // Builds an mlir::Value representing a tensor with the given shape. + Value BuildTensor(ArrayRef shape) { + return builder_.create( + builder_.getUnknownLoc(), + RankedTensorType::get(shape, builder_.getF32Type()), + builder_.getZeroAttr( + RankedTensorType::get(shape, builder_.getF32Type()))); + } + + // Builds a DenseElementsAttr representing an integer array. 
+ DenseElementsAttr BuildIntArrayAttr(ArrayRef values) { + return DenseElementsAttr::get( + RankedTensorType::get({static_cast(values.size())}, + builder_.getI32Type()), + values); + } + + MLIRContext context_; + OpBuilder builder_; +}; + +TEST_F(BroadcastAndReductionAxesIndependentTest, IndependentAxes) { + Value input_tensor = BuildTensor({2, 1, 4, 1}); + DenseElementsAttr reduction_axes = BuildIntArrayAttr({0, 2}); + DenseElementsAttr target_shape = BuildIntArrayAttr({2, 3, 4, 5}); + + EXPECT_TRUE(AreBroadcastAndReductionAxesIndependent( + input_tensor, reduction_axes, target_shape)); + input_tensor.getDefiningOp()->destroy(); +} + +TEST_F(BroadcastAndReductionAxesIndependentTest, OverlappingAxes) { + Value input_tensor = BuildTensor({1, 3, 4, 5}); + DenseElementsAttr reduction_axes = BuildIntArrayAttr({0, 2}); + DenseElementsAttr target_shape = BuildIntArrayAttr({2, 3, 4, 5}); + + EXPECT_FALSE(AreBroadcastAndReductionAxesIndependent( + input_tensor, reduction_axes, target_shape)); + input_tensor.getDefiningOp()->destroy(); +} + +TEST_F(BroadcastAndReductionAxesIndependentTest, EmptyReductionAxes) { + Value input_tensor = BuildTensor({1, 3, 1, 5}); + DenseElementsAttr reduction_axes = BuildIntArrayAttr({}); + DenseElementsAttr target_shape = BuildIntArrayAttr({2, 3, 4, 5}); + + EXPECT_TRUE(AreBroadcastAndReductionAxesIndependent( + input_tensor, reduction_axes, target_shape)); + input_tensor.getDefiningOp()->destroy(); +} + +TEST_F(BroadcastAndReductionAxesIndependentTest, UnrankedInput) { + Value input_tensor = builder_.create( + builder_.getUnknownLoc(), builder_.getF32Type(), + builder_.getZeroAttr(builder_.getF32Type())); + DenseElementsAttr reduction_axes = BuildIntArrayAttr({0, 2}); + DenseElementsAttr target_shape = BuildIntArrayAttr({2, 3, 4, 5}); + + EXPECT_FALSE(AreBroadcastAndReductionAxesIndependent( + input_tensor, reduction_axes, target_shape)); + input_tensor.getDefiningOp()->destroy(); +} + +TEST_F(BroadcastAndReductionAxesIndependentTest, InvalidReductionAxesType) { + Value input_tensor = BuildTensor({2, 3, 4, 5}); + DenseElementsAttr reduction_axes = DenseElementsAttr::get( + RankedTensorType::get({2}, builder_.getF32Type()), {1.0f, 2.0f}); + DenseElementsAttr target_shape = BuildIntArrayAttr({1, 3, 1, 5}); + + EXPECT_FALSE(AreBroadcastAndReductionAxesIndependent( + input_tensor, reduction_axes, target_shape)); + input_tensor.getDefiningOp()->destroy(); +} + +TEST_F(BroadcastAndReductionAxesIndependentTest, InvalidTargetShapeType) { + Value input_tensor = BuildTensor({2, 3, 4, 5}); + DenseElementsAttr reduction_axes = BuildIntArrayAttr({0, 2}); + DenseElementsAttr target_shape = DenseElementsAttr::get( + RankedTensorType::get({2}, builder_.getF32Type()), {1.0f, 2.0f}); + + EXPECT_FALSE(AreBroadcastAndReductionAxesIndependent( + input_tensor, reduction_axes, target_shape)); + input_tensor.getDefiningOp()->destroy(); +} + +} // namespace +} // namespace TFL + +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/utils/variables_utils.cc b/tensorflow/compiler/mlir/lite/utils/variables_utils.cc index 0cab3ff3db32..fe13b43c0163 100644 --- a/tensorflow/compiler/mlir/lite/utils/variables_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/variables_utils.cc @@ -29,17 +29,15 @@ namespace utils { bool IsSupportedVariableType(Operation* op) { ShapedType type; if (llvm::isa(op)) { - type = op->getResult(0).getType().cast(); + type = llvm::cast(op->getResult(0).getType()); } else if (llvm::isa(op)) { - type = op->getOperand(1).getType().cast(); + type = 
llvm::cast(op->getOperand(1).getType()); } else if (llvm::isa(op)) { - type = op->getResult(0) - .getType() - .cast() - .getElementType() - .cast() - .GetSubtypes() - .back(); + type = + llvm::cast( + llvm::cast(op->getResult(0).getType()).getElementType()) + .GetSubtypes() + .back(); } return IsSupportedVariableType(type); } @@ -47,13 +45,13 @@ bool IsSupportedVariableType(Operation* op) { bool IsSupportedVariableType(ShapedType type) { auto element_type = type.getElementType(); // Check complex types. - if (auto complex_type = element_type.dyn_cast()) { + if (auto complex_type = llvm::dyn_cast(element_type)) { auto complex_element_type = complex_type.getElementType(); if (complex_element_type.isF32() || complex_element_type.isF64()) return true; } // Check quantized types. - if (auto quant_type = element_type.dyn_cast()) { + if (auto quant_type = llvm::dyn_cast(element_type)) { // TFLite supports QI16, QI32, QI8, and QUI8 if ((quant_type.getStorageTypeIntegralWidth() == 16 && quant_type.isSigned()) || diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc index 41cc194be23f..9c8e27a51b49 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc @@ -348,9 +348,11 @@ absl::Status MlirFunctionOptimizationPass::Run( // error to the caller. // Enabled - return error back to the caller. if (pass_state == MlirOptimizationPassState::FallbackEnabled) { - LOG(WARNING) << StringRefToView(name) - << " pass failed, continuing without the pass because the " - "pass has fallback enabled"; + LOG(WARNING) + << StringRefToView(name) + << " pass failed, continuing without the pass because the " + << "pass has fallback enabled. This was the pass failure:\n" + << pass_status; mlir_function_pass_fallback_count->GetCell(kFailure)->IncrementBy(1); } else if (pass_state == MlirOptimizationPassState::Enabled) { return pass_status; diff --git a/tensorflow/compiler/mlir/python/BUILD b/tensorflow/compiler/mlir/python/BUILD index 304ab73ea9b7..07a516a70f38 100644 --- a/tensorflow/compiler/mlir/python/BUILD +++ b/tensorflow/compiler/mlir/python/BUILD @@ -52,7 +52,6 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:mlir_import_options", "@local_xla//xla/mlir/framework/transforms:passes", "@local_xla//xla/mlir_hlo:all_passes", - "//tensorflow/compiler/mlir/lite:flatbuffer_import", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:import_model", "//tensorflow/compiler/mlir/tensorflow:error_util", diff --git a/tensorflow/compiler/mlir/python/mlir.cc b/tensorflow/compiler/mlir/python/mlir.cc index 80fa3ecf23f8..5eaf5d736262 100644 --- a/tensorflow/compiler/mlir/python/mlir.cc +++ b/tensorflow/compiler/mlir/python/mlir.cc @@ -47,7 +47,6 @@ limitations under the License. 
#include "tensorflow/c/tf_status_helper.h" #include "tensorflow/cc/saved_model/bundle_v2.h" #include "tensorflow/cc/saved_model/loader.h" -#include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h" #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" diff --git a/tensorflow/compiler/mlir/quantization/common/BUILD b/tensorflow/compiler/mlir/quantization/common/BUILD index 2e357393d36f..975840a70db2 100644 --- a/tensorflow/compiler/mlir/quantization/common/BUILD +++ b/tensorflow/compiler/mlir/quantization/common/BUILD @@ -9,6 +9,7 @@ package( "//learning/brain/mlir/quantization:__subpackages__", "//tensorflow/compiler/mlir/lite:__subpackages__", "//tensorflow/compiler/mlir/quantization:__subpackages__", + "//tensorflow/compiler/mlir/stablehlo:__subpackages__", ], licenses = ["notice"], ) @@ -24,6 +25,37 @@ td_library( ], ) +cc_library( + name = "tf_lift_as_function_call", + srcs = ["tf_lift_as_function_call.cc"], + hdrs = ["tf_lift_as_function_call.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":tf_attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo:stablehlo_type_utils", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:quantization_unit_loc", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", + "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs", + "//tensorflow/core:framework_lite", + "//tensorflow/core/ir/types:Dialect", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:nullability", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@stablehlo//:version", + ], +) + cc_library( name = "lift_as_function_call", srcs = ["lift_as_function_call.cc"], @@ -120,6 +152,32 @@ cc_library( ":func", "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:context", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/core:test", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + "@stablehlo//:stablehlo_ops", + ], +) + +cc_library( + name = "tf_test_base", + testonly = 1, + srcs = [], + hdrs = ["tf_test_base.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":func", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:context", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", @@ -136,6 +194,34 @@ cc_library( ], ) +cc_library( + name = 
"tf_attrs_and_constraints", + srcs = [ + "tf_attrs_and_constraints.cc", + ], + hdrs = [ + "tf_attrs_and_constraints.h", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + ":tf_uniform_quantized_types", + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:DerivedAttributeOpInterface", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@stablehlo//:stablehlo_ops", + ], +) + cc_library( name = "attrs_and_constraints", srcs = [ @@ -199,6 +285,19 @@ td_library( ], ) +cc_library( + name = "tf_uniform_quantized_types", + srcs = ["tf_uniform_quantized_types.cc"], + hdrs = ["tf_uniform_quantized_types.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "uniform_quantized_types", srcs = ["uniform_quantized_types.cc"], diff --git a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td index 1921345d6012..b6085d30f656 100644 --- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td @@ -17,7 +17,7 @@ include "mlir/IR/PatternBase.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td" def DenseElementsAttr : ElementsAttrBase< - CPred<"$_self.isa()">, + CPred<"llvm::isa($_self)">, "non-opaque constant tensor">; // Checks if the data format is "NHWC". @@ -31,13 +31,13 @@ def IsConstTensor : Constraint($0.getDefin // Checks if the element value has a float type. def IsFloatElementsAttr : ElementsAttrBase< - CPred<"$_self.isa() && " - "getElementTypeOrSelf($_self.cast().getType()).isa()">, + CPred<"llvm::isa($_self) && " + "llvm::isa(getElementTypeOrSelf(llvm::cast($_self).getType()))">, "float constant tensor">; // Checks if the boolean value is false. def IsFalseBoolAttr : AttrConstraint< - CPred<"!$_self.cast().getValue()">>; + CPred<"!llvm::cast($_self).getValue()">>; // Checks if the value has only one user. def HasOneUse : Constraint>; @@ -63,7 +63,7 @@ def IsBF16ElementType : Constraint< // Checks if the value has the type of UniformQuantizedType. def IsUniformQuantizedType : Constraint< - CPred<"getElementTypeOrSelf($0).isa()">>; + CPred<"llvm::isa(getElementTypeOrSelf($0))">>; // Checks if the given two values have the same type. def AreTheSameElementType : Constraint< @@ -75,12 +75,12 @@ def AreTheSameValue : Constraint< // Checks if the value has rank. def HasRank : Constraint< - CPred<"$0.getType().cast().hasRank()">>; + CPred<"llvm::cast($0.getType()).hasRank()">>; // Checks if the value has rank of `n`. class HasRankOf : Constraint< - CPred<"$0.getType().cast().hasRank() && " - "$0.getType().cast().getRank() == " # n>, + CPred<"llvm::cast($0.getType()).hasRank() && " + "llvm::cast($0.getType()).getRank() == " # n>, "Checks if the value has rank of 'n'.">; // Checks if the value has static shape. 
diff --git a/tensorflow/compiler/mlir/quantization/common/ir/BUILD b/tensorflow/compiler/mlir/quantization/common/ir/BUILD index 615f54f70d23..162c14c4ad70 100644 --- a/tensorflow/compiler/mlir/quantization/common/ir/BUILD +++ b/tensorflow/compiler/mlir/quantization/common/ir/BUILD @@ -25,56 +25,66 @@ td_library( gentbl_cc_library( name = "QuantOpsIncGen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-decls"], - "QuantOps.h.inc", - ), - ( - ["-gen-op-defs"], - "QuantOps.cc.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=quantization", - ], - "QuantOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=quantization", - ], - "QuantOpsDialect.cc.inc", - ), - ], + tbl_outs = { + "QuantOps.h.inc": ["-gen-op-decls"], + "QuantOps.cc.inc": ["-gen-op-defs"], + "QuantOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=quantization", + ], + "QuantOpsDialect.cc.inc": [ + "-gen-dialect-defs", + "-dialect=quantization", + ], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "QuantOps.td", deps = [":QuantizationOpsTdFiles"], ) +gentbl_cc_library( + name = "QuantPassIncGen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"Passes.h.inc": [ + "-gen-pass-decls", + "-name=tfquant", + ]}, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "Passes.td", + deps = ["@llvm-project//mlir:PassBaseTdFiles"], +) + cc_library( name = "QuantOps", srcs = [ + "ConvertConst.cc", + "ConvertSimQuant.cc", "FakeQuantSupport.cc", "QuantOps.cc", + "QuantizeUtils.cc", "UniformSupport.cc", ], hdrs = [ "FakeQuantSupport.h", + "Passes.h", "QuantOps.h", + "QuantizeUtils.h", "UniformSupport.h", ], compatible_with = get_compatible_with_portable(), deps = [ ":QuantOpsIncGen", + ":QuantPassIncGen", "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:BytecodeOpInterface", + "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:Pass", "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:SideEffectInterfaces", "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", ], ) diff --git a/tensorflow/compiler/mlir/quantization/common/ir/ConvertConst.cc b/tensorflow/compiler/mlir/quantization/common/ir/ConvertConst.cc new file mode 100644 index 000000000000..22f4bb6019d1 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/ir/ConvertConst.cc @@ -0,0 +1,124 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/Passes.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantizeUtils.h" + +namespace mlir { +namespace quant::ir { + +using mlir::quant::QuantizedType; + +namespace { +#define GEN_PASS_DEF_QUANTCONVERTCONST +#include "tensorflow/compiler/mlir/quantization/common/ir/Passes.h.inc" + +struct ConvertConstPass : public impl::QuantConvertConstBase { + void runOnOperation() override; +}; + +struct QuantizedConstRewrite : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(QuantizeCastOp qbarrier, + PatternRewriter &rewriter) const override; +}; + +} // namespace + +/// Matches a [constant] -> [qbarrier] where the qbarrier results type is +/// quantized and the operand type is quantizable. + +LogicalResult QuantizedConstRewrite::matchAndRewrite( + QuantizeCastOp qbarrier, PatternRewriter &rewriter) const { + Attribute value; + + // Is the operand a constant? + if (!matchPattern(qbarrier.getArg(), m_Constant(&value))) { + return failure(); + } + + // Does the qbarrier convert to a quantized type. This will not be true + // if a quantized type has not yet been chosen or if the cast to an equivalent + // storage type is not supported. + Type qbarrierResultType = qbarrier.getResult().getType(); + QuantizedType quantizedElementType = + QuantizedType::getQuantizedElementType(qbarrierResultType); + if (!quantizedElementType) { + return failure(); + } + if (!QuantizedType::castToStorageType(qbarrierResultType)) { + return failure(); + } + + // Is the operand type compatible with the expressed type of the quantized + // type? This will not be true if the qbarrier is superfluous (converts + // from and to a quantized type). + if (!quantizedElementType.isCompatibleExpressedType( + qbarrier.getArg().getType())) { + return failure(); + } + + // Is the constant value a type expressed in a way that we support? + if (!mlir::isa(value)) { + return failure(); + } + + Type newConstValueType; + auto newConstValue = + quantizeAttr(value, quantizedElementType, newConstValueType); + if (!newConstValue) { + return failure(); + } + + // When creating the new const op, use a fused location that combines the + // original const and the qbarrier that led to the quantization. 
+ auto fusedLoc = rewriter.getFusedLoc( + {qbarrier.getArg().getDefiningOp()->getLoc(), qbarrier.getLoc()}); + auto newConstOp = rewriter.create( + fusedLoc, newConstValueType, cast(newConstValue)); + rewriter.replaceOpWithNewOp(qbarrier, qbarrier.getType(), + newConstOp); + return success(); +} + +void ConvertConstPass::runOnOperation() { + RewritePatternSet patterns(&getContext()); + auto func = getOperation(); + auto *context = &getContext(); + patterns.add(context); + (void)applyPatternsGreedily(func, std::move(patterns)); +} + +std::unique_ptr> createConvertConstPass() { + return std::make_unique(); +} + +} // namespace quant::ir +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/common/ir/ConvertSimQuant.cc b/tensorflow/compiler/mlir/quantization/common/ir/ConvertSimQuant.cc new file mode 100644 index 000000000000..51e362eb4166 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/ir/ConvertSimQuant.cc @@ -0,0 +1,158 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/Passes.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h" + +namespace mlir::quant::ir { + +#define GEN_PASS_DEF_QUANTCONVERTSIMULATEDQUANT +#include "tensorflow/compiler/mlir/quantization/common/ir/Passes.h.inc" + +struct ConvertSimulatedQuantPass + : public impl::QuantConvertSimulatedQuantBase { + void runOnOperation() override; +}; + +/// Base class rewrites ConstFakeQuant into a qbarrier/dbarrier pair. +template +class FakeQuantRewrite : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + FakeQuantRewrite(MLIRContext *ctx, bool *hadFailure) + : OpRewritePattern(ctx), hadFailure(hadFailure) {} + + LogicalResult matchAndRewrite(FakeQuantOp op, + PatternRewriter &rewriter) const override { + // TODO: If this pattern comes up more frequently, consider adding core + // support for failable rewrites. 
+ if (failableRewrite(op, rewriter)) { + *hadFailure = true; + return failure(); + } + return success(); + } + + private: + bool *hadFailure; + + bool failableRewrite(FakeQuantOp op, PatternRewriter &rewriter) const { + auto converter = + mlir::quant::ir::ExpressedToQuantizedConverter::forInputType( + op.getType()); + if (!converter) { + return (op.emitError("unsupported quantized type conversion"), true); + } + + quant::QuantizedType elementType = + static_cast(this) + ->convertFakeQuantAttrsToType(op, converter.expressed_type); + + if (!elementType) { + // Note that the fakeQuantAttrsToType will have emitted the error. + return true; + } + + Type quantizedType = converter.convert(elementType); + assert(quantizedType && + "Converter accepted a type that it did not convert"); + + // TODO: Map to a qbarrier with an attribute like [Forced] to signal that + // this is a forced/hard-coded constraint. + auto qbarrier = rewriter.create(op.getLoc(), quantizedType, + op.getInputs()); + rewriter.replaceOpWithNewOp(op, converter.input_type, + qbarrier.getResult()); + + return false; + } +}; + +class ConstFakeQuantRewrite + : public FakeQuantRewrite { + public: + using BaseRewrite = FakeQuantRewrite; + + ConstFakeQuantRewrite(MLIRContext *ctx, bool *hadFailure) + : BaseRewrite(ctx, hadFailure) {} + + quant::QuantizedType convertFakeQuantAttrsToType(ConstFakeQuant fqOp, + Type expressedType) const { + return quantfork::fakeQuantAttrsToType( + fqOp.getLoc(), fqOp.getNumBits(), fqOp.getMin().convertToFloat(), + fqOp.getMax().convertToFloat(), fqOp.getNarrowRange(), expressedType, + fqOp.getIsSigned()); + } +}; + +class ConstFakeQuantPerAxisRewrite + : public FakeQuantRewrite { + public: + using BaseRewrite = + FakeQuantRewrite; + + ConstFakeQuantPerAxisRewrite(MLIRContext *ctx, bool *hadFailure) + : BaseRewrite(ctx, hadFailure) {} + + quant::QuantizedType convertFakeQuantAttrsToType(ConstFakeQuantPerAxis fqOp, + Type expressedType) const { + SmallVector min, max; + min.reserve(fqOp.getMin().size()); + max.reserve(fqOp.getMax().size()); + for (auto m : fqOp.getMin()) + min.push_back(cast(m).getValueAsDouble()); + for (auto m : fqOp.getMax()) + max.push_back(cast(m).getValueAsDouble()); + + return quantfork::fakeQuantAttrsToType( + fqOp.getLoc(), fqOp.getNumBits(), fqOp.getAxis(), min, max, + fqOp.getNarrowRange(), expressedType, fqOp.getIsSigned()); + } +}; + +void ConvertSimulatedQuantPass::runOnOperation() { + bool hadFailure = false; + auto func = getOperation(); + RewritePatternSet patterns(func.getContext()); + auto *ctx = func.getContext(); + patterns.add( + ctx, &hadFailure); + (void)applyPatternsGreedily(func, std::move(patterns)); + if (hadFailure) signalPassFailure(); +} + +std::unique_ptr> createConvertSimulatedQuantPass() { + return std::make_unique(); +} + +} // namespace mlir::quant::ir diff --git a/tensorflow/compiler/mlir/quantization/common/ir/Passes.h b/tensorflow/compiler/mlir/quantization/common/ir/Passes.h new file mode 100644 index 000000000000..29ba597c253c --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/ir/Passes.h @@ -0,0 +1,57 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+//
+// This file defines all of the passes owned by the quantization dialect. As
+// things mature, it is expected that passes specific to certain frontend or
+// backend dialects will move to those dialects directly. For now, they are
+// incubated here.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_PASSES_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_PASSES_H_
+
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+
+namespace mlir {
+namespace func {
+class FuncOp;
+}  // namespace func
+
+namespace quant::ir {
+
+/// Creates a pass that converts quantization simulation operations (i.e.
+/// FakeQuant and those like it) to casts into/out of supported QuantizedTypes.
+std::unique_ptr<OperationPass<func::FuncOp>> createConvertSimulatedQuantPass();
+
+/// Creates a pass that converts constants followed by a qbarrier to a
+/// constant whose value is quantized. This is typically one of the last
+/// passes done when lowering to express actual quantized arithmetic in a
+/// low level representation. Because it modifies the constant, it is
+/// destructive and cannot be undone.
+std::unique_ptr<OperationPass<func::FuncOp>> createConvertConstPass();
+
+//===----------------------------------------------------------------------===//
+// Registration
+//===----------------------------------------------------------------------===//
+
+/// Generate the code for registering passes.
+#define GEN_PASS_REGISTRATION
+#include "tensorflow/compiler/mlir/quantization/common/ir/Passes.h.inc"
+
+}  // namespace quant::ir
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_PASSES_H_
diff --git a/tensorflow/compiler/mlir/quantization/common/ir/Passes.td b/tensorflow/compiler/mlir/quantization/common/ir/Passes.td
new file mode 100644
index 000000000000..86702d598a0a
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/common/ir/Passes.td
@@ -0,0 +1,34 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#ifndef TF_QUANT_PASSES +#define TF_QUANT_PASSES + +include "mlir/Pass/PassBase.td" + +def QuantConvertConst : Pass<"quant-convert-const", "func::FuncOp"> { + let summary = "Converts constants followed by qbarrier to actual quantized " + "values"; + let constructor = "mlir::quant::ir::createConvertConstPass()"; +} + +def QuantConvertSimulatedQuant + : Pass<"quant-convert-simulated-quantization", "func::FuncOp"> { + let summary = "Converts training-time simulated quantization ops to " + "corresponding quantize/dequantize casts"; + let constructor = "mlir::quant::ir::createConvertSimulatedQuantPass()"; +} + +#endif // TF_QUANT_PASSES diff --git a/tensorflow/compiler/mlir/quantization/common/ir/QuantizeUtils.cc b/tensorflow/compiler/mlir/quantization/common/ir/QuantizeUtils.cc new file mode 100644 index 000000000000..f5e92ccc4d58 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/ir/QuantizeUtils.cc @@ -0,0 +1,148 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantizeUtils.h" + +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h" + +namespace mlir { +namespace quant::ir { + +/// Converts a possible primitive, real expressed value attribute to a +/// corresponding storage attribute (typically FloatAttr -> IntegerAttr). +/// quantizedElementType is the QuantizedType that describes the expressed +/// origValue. +/// Returns a converter Attribute or nullptr if conversion is not possible. +static Attribute convertPrimitiveValueAttr( + Attribute origRealValue, quant::QuantizedType quantizedElementType, + const UniformQuantizedValueConverter &converter, Type &outConvertedType) { + if (mlir::isa(origRealValue)) { + FloatAttr floatAttr = mlir::cast(origRealValue); + outConvertedType = quantizedElementType.getStorageType(); + return IntegerAttr::get(quantizedElementType.getStorageType(), + converter.quantizeFloatToInt(floatAttr.getValue())); + } + + return nullptr; +} + +/// Converts a real expressed DenseFPElementsAttr to a corresponding +/// DenseElementsAttr (typically DenseIntElementsAttr) containing quantized +/// storage values assuming the given quantizedElementType and converter. 
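+/// For example (illustrative only): with a uniform i8 quantized type of scale
+/// 0.5 and zero point 0, a tensor<4xf32> attribute holding [1.0, 2.0, 3.0,
+/// 4.0] maps element-wise through the converter to roughly [2, 4, 6, 8],
+/// yielding a tensor<4xi8> DenseIntElementsAttr.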
+static DenseElementsAttr convertDenseFPElementsAttr(
+    DenseFPElementsAttr realFPElementsAttr,
+    quant::QuantizedType quantizedElementType,
+    const UniformQuantizedValueConverter &converter) {
+  return realFPElementsAttr.mapValues(
+      quantizedElementType.getStorageType(),
+      [&converter](const APFloat &realVal) {
+        return converter.quantizeFloatToInt(realVal);
+      });
+}
+
+/// Converts a real expressed SparseElementsAttr to a corresponding
+/// SparseElementsAttr containing quantized storage values assuming the given
+/// quantizedElementType and converter.
+static SparseElementsAttr convertSparseElementsAttr(
+    SparseElementsAttr realSparseAttr,
+    quant::QuantizedType quantizedElementType,
+    const UniformQuantizedValueConverter &converter) {
+  DenseElementsAttr realDenseAttr = realSparseAttr.getValues();
+  if (!mlir::isa<DenseFPElementsAttr>(realDenseAttr)) {
+    return nullptr;
+  }
+  DenseElementsAttr quantDenseAttr = convertDenseFPElementsAttr(
+      mlir::cast<DenseFPElementsAttr>(realDenseAttr), quantizedElementType,
+      converter);
+  if (!quantDenseAttr) {
+    return nullptr;
+  }
+
+  // Cast from an expressed-type-based type to storage-type-based type,
+  // preserving the sparse shape (i.e. tensor<4xf32> -> tensor<4xi8>).
+  ShapedType newSparseType = mlir::dyn_cast_or_null<ShapedType>(
+      quantizedElementType.castExpressedToStorageType(
+          realSparseAttr.getType()));
+  if (!newSparseType) {
+    return nullptr;
+  }
+  return SparseElementsAttr::get(newSparseType, realSparseAttr.getIndices(),
+                                 quantDenseAttr);
+}
+
+/// Converts a real expressed Attribute to a corresponding Attribute containing
+/// quantized storage values assuming the given uniform quantizedElementType
+/// and converter.
+Attribute quantizeAttrUniform(Attribute realValue,
+                              quant::UniformQuantizedType quantizedElementType,
+                              const UniformQuantizedValueConverter &converter,
+                              Type &outConvertedType) {
+  // Fork to handle different variants of constants supported.
+  if (mlir::isa<DenseFPElementsAttr>(realValue)) {
+    // Dense tensor or vector constant.
+    auto converted = convertDenseFPElementsAttr(
+        mlir::cast<DenseFPElementsAttr>(realValue), quantizedElementType,
+        converter);
+    outConvertedType = converted.getType();
+    return converted;
+  }
+  if (mlir::isa<SparseElementsAttr>(realValue)) {
+    // Sparse tensor or vector constant.
+    auto converted = convertSparseElementsAttr(
+        mlir::cast<SparseElementsAttr>(realValue), quantizedElementType,
+        converter);
+    outConvertedType = converted.getType();
+    return converted;
+  }
+  // Nothing else matched: try to convert a primitive.
+  return convertPrimitiveValueAttr(realValue, quantizedElementType, converter,
+                                   outConvertedType);
+}
+
+/// Converts an attribute from a type based on
+/// quantizedElementType.getExpressedType() to one based on
+/// quantizedElementType.getStorageType().
+/// Returns nullptr if the conversion is not supported.
+/// On success, stores the converted type in outConvertedType.
+Attribute quantizeAttr(Attribute realValue,
+                       quant::QuantizedType quantizedElementType,
+                       Type &outConvertedType) {
+  if (auto uniformQuantized =
+          mlir::dyn_cast<quant::UniformQuantizedType>(quantizedElementType)) {
+    UniformQuantizedValueConverter converter(uniformQuantized);
+    return quantizeAttrUniform(realValue, uniformQuantized, converter,
+                               outConvertedType);
+  }
+  if (auto uniformQuantizedPerAxis =
+          mlir::dyn_cast<quant::UniformQuantizedPerAxisType>(
+              quantizedElementType)) {
+    UniformQuantizedPerAxisValueConverter converter(uniformQuantizedPerAxis);
+    auto converted = converter.convert(realValue);
+    // TODO: Why do we need this outConvertedType? Consider removing it.
+ if (converted) { + outConvertedType = converted.getType(); + } + return converted; + } + return nullptr; +} + +} // namespace quant::ir +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/common/ir/QuantizeUtils.h b/tensorflow/compiler/mlir/quantization/common/ir/QuantizeUtils.h new file mode 100644 index 000000000000..cf9184a1dfea --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/ir/QuantizeUtils.h @@ -0,0 +1,71 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_QUANTIZEUTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_QUANTIZEUTILS_H_ + +namespace mlir { +class Attribute; +class Type; + +namespace quant { +class QuantizedType; + +namespace ir { +class UniformQuantizedType; +class UniformQuantizedValueConverter; + +/// Converts an attribute from a type based on +/// quantizedElementType.getExpressedType() to one based on +/// quantizedElementType.getStorageType(), where quantizedElementType is as from +/// QuantizedType::getQuantizedElementType(). +/// Returns nullptr if the conversion is not supported. On success, stores the +/// converted type in outConvertedType. +/// +/// Examples: +/// 1. realValue is a primitive value attribute: +/// (realValue: FloatAttr, quantizedElementType: UniformQuantizedType[i8:f32]) +/// -> (IntegerAttr, outConvertedType: i8) +/// 2. realValue is an elements attribute: +/// (realValue: DenseElementsAttr[tensor<2x2xf32>], +/// quantizedElementType: UniformQuantizedType[i8:f32]) +/// -> (DenseElementsAttr[tensor<2x2xi8>], outConvertedType: tensor<2x2xi8>) +Attribute quantizeAttr(Attribute realValue, QuantizedType quantizedElementType, + Type &outConvertedType); + +/// Converts an attribute from a type based on +/// quantizedElementType.getExpressedType() to one based on +/// quantizedElementType.getStorageType(), where quantizedElementType is as from +/// QuantizedType::getQuantizedElementType() and casted to an +/// UniformQuantizedType. Returns nullptr if the conversion is not supported. On +/// success, stores the converted type in outConvertedType. +/// +/// Examples: +/// 1. realValue is a primitive value attribute: +/// (realValue: FloatAttr, quantizedElementType: UniformQuantizedType[i8:f32]) +/// -> (IntegerAttr, outConvertedType: i8) +/// 2. 
realValue is an elements attribute: +/// (realValue: DenseElementsAttr[tensor<2x2xf32>], +/// quantizedElementType: UniformQuantizedType[i8:f32]) +/// -> (DenseElementsAttr[tensor<2x2xi8>], outConvertedType: tensor<2x2xi8>) +Attribute quantizeAttrUniform(Attribute realValue, + UniformQuantizedType quantizedElementType, + const UniformQuantizedValueConverter &converter, + Type &outConvertedType); +} // namespace ir +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_QUANTIZEUTILS_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.cc b/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.cc index 3d5535791f31..d0a1e09ebbc6 100644 --- a/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.cc +++ b/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.cc @@ -31,7 +31,7 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -namespace mlir::quantfork { +namespace mlir::quant::ir { static bool isQuantizablePrimitiveType(Type input_type) { return isa(input_type); @@ -109,4 +109,4 @@ DenseElementsAttr UniformQuantizedPerAxisValueConverter::convert( }); } -} // namespace mlir::quantfork +} // namespace mlir::quant::ir diff --git a/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h b/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h index f4dcc8bf313d..0d4b94aab0a2 100644 --- a/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h +++ b/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h @@ -34,7 +34,7 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -namespace mlir::quantfork { +namespace mlir::quant::ir { // Performs type conversion from an arbitrary input type to a type // that is expressed by a QuantizedType. 
@@ -242,6 +242,6 @@ class UniformQuantizedPerAxisValueConverter { int32_t quantization_dim_; }; -} // namespace mlir::quantfork +} // namespace mlir::quant::ir #endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_UNIFORMSUPPORT_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc index bf0cf8aa2ba9..c4d1fc32a705 100644 --- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc @@ -491,7 +491,7 @@ bool IsEinsumSupportedByXlaDotV2(StringAttr equation_attr) { rhs_out_idx_start >= batch_dim_size; } -absl::StatusOr GetQuantizationMethod(absl::Nonnull op) { +absl::StatusOr GetQuantizationMethod(Operation* absl_nonnull op) { const auto quantization_method_attr = op->getAttrOfType(kQuantizationMethodAttr); if (!quantization_method_attr) { @@ -509,7 +509,7 @@ absl::StatusOr GetQuantizationMethod(absl::Nonnull op) { return quantization_method; } -Method GetQuantizationMethodOrDefault(absl::Nonnull op) { +Method GetQuantizationMethodOrDefault(Operation* absl_nonnull op) { absl::StatusOr method = GetQuantizationMethod(op); if (method.status().code() == absl::StatusCode::kInternal) { // This indicates that the `Method` protobuf string is corrupt, but this diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h index 22e0307f4a9e..b9faba72f147 100644 --- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h @@ -70,14 +70,14 @@ bool IsEinsumSupportedByXlaDotV2(StringAttr equation_attr); // `absl::InternalError` when parsing the attribute to `Method` failed. // `op` must be non-null. absl::StatusOr<::stablehlo::quantization::Method> GetQuantizationMethod( - absl::Nonnull op); + Operation* absl_nonnull op); // Gets the quantization method from `op`. It is retrieved from the // `kQuantizationMethodAttr` string attribute. Returns a default instance of // `Method` iff the attribute doesn't exist or the attribute contains an invalid // textproto for `Method`. `op` must be non-null. ::stablehlo::quantization::Method GetQuantizationMethodOrDefault( - absl::Nonnull op); + Operation* absl_nonnull op); // Creates a function to wrap the section between arguments and results. 
// The generated function call op type will be decided by the given call_op_type diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/BUILD b/tensorflow/compiler/mlir/quantization/common/quantization_lib/BUILD index b6b1d17d17a4..36b7152c15ff 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/BUILD +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/BUILD @@ -102,16 +102,10 @@ td_library( gentbl_cc_library( name = "quantization_interfaces_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "quantization_interface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "quantization_interface.cc.inc", - ), - ], + tbl_outs = { + "quantization_interface.h.inc": ["-gen-op-interface-decls"], + "quantization_interface.cc.inc": ["-gen-op-interface-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "quantization.td", deps = [ diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization.td b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization.td index 0f9b6a74762f..706eb8552eb1 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization.td +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization.td @@ -31,12 +31,12 @@ include "mlir/Dialect/Quant/IR/QuantBase.td" // explicit signedness check to differentiate the signed/unsigned constraints // predicates from one another at the TD level. class QuantizedType params, bit signed> - : Type()">, - CPred<"$_self.cast()" # + : Type($_self)">, + CPred<"llvm::cast($_self)" # ".getStorageTypeIntegralWidth() == " # !head(params)>, - Or<[CPred<"$_self.cast()" # + Or<[CPred<"llvm::cast($_self)" # ".getStorageType().isSignlessInteger()">, - CPred<"$_self.cast()" # + CPred<"llvm::cast($_self)" # ".getStorageType().isSignedInteger() == " # signed>]>]>, "Q" # !if (signed, "I", "UI") # !head(params) # " type"> { string name = n; diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.cc b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.cc index 29b14dc98dd8..d0c3e1899503 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.cc +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.cc @@ -46,9 +46,9 @@ limitations under the License. 
#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" -#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.h" #include "tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/portable_tensor_utils.h" #include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantizeUtils.h" #include "tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h" #include "tensorflow/compiler/mlir/tools/optimize/quantization_utils.h" @@ -279,7 +279,7 @@ Type GetQuantizedType(Builder builder, const Type input_type, const bool legacy_float_scale, const bool use_fake_quant_num_bits) { auto converter = - quantfork::ExpressedToQuantizedConverter::forInputType(input_type); + mlir::quant::ir::ExpressedToQuantizedConverter::forInputType(input_type); // Expand the range to prevent extremely small scales and large quantized // integers which can cause overflow. This leads to scale @@ -710,7 +710,7 @@ ElementsAttr Quantize(const Attribute real_value, const Type tensor_type) { quant::QuantizedType::getQuantizedElementType(tensor_type)) { Type converted_type; return dyn_cast_or_null( - quantfork::quantizeAttr(real_value, q_type, converted_type)); + mlir::quant::ir::quantizeAttr(real_value, q_type, converted_type)); } return {}; } diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h index 94169e3e9436..51dbc257d3b7 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h @@ -200,7 +200,7 @@ bool QuantizableOpSupportsFloatOutputType(Operation* op); // Specialized version of location to string for flatbuffer exported locations. 
inline std::string GetTensorNameFromLoc(Location loc) { - if (auto name_loc = loc.dyn_cast()) { + if (auto name_loc = llvm::dyn_cast(loc)) { return name_loc.getName().str(); } return ""; @@ -218,7 +218,7 @@ struct ConvertStatsToQDQs : public OpRewritePattern { LogicalResult matchAndRewrite(quantfork::StatisticsOp op, PatternRewriter& rewriter) const override { - Type expressed = op.getType().cast().getElementType(); + Type expressed = llvm::cast(op.getType()).getElementType(); quant::QuantizedType quant_type; SmallVector mins, maxs; @@ -226,7 +226,8 @@ struct ConvertStatsToQDQs : public OpRewritePattern { // Per axis quantization (or per channel quantization) int stats_num = op.getAxisStats()->getNumElements(); if (stats_num == 0 || stats_num % 2 != 0) return failure(); - auto stats = op.getAxisStats()->dyn_cast(); + auto stats = + llvm::dyn_cast(op.getAxisStats().value()); if (!stats) return failure(); for (auto it = stats.begin(), e = stats.end(); it != e; ++it) { @@ -255,7 +256,7 @@ struct ConvertStatsToQDQs : public OpRewritePattern { quant_type = DownCastScale(quant_type, mins, maxs, op->getLoc()); } } else if (auto stats = - op.getLayerStats().dyn_cast()) { + llvm::dyn_cast(op.getLayerStats())) { // Per tensor quantization auto statValues = stats.getValues(); double rmin = FloatAttr::getValueAsDouble(statValues[0]); @@ -481,7 +482,7 @@ class QuantizationPattern : public RewritePattern { } if (!nodes_blocklist.empty()) { - if (auto name_loc = quantizing_op->getLoc().dyn_cast()) { + if (auto name_loc = llvm::dyn_cast(quantizing_op->getLoc())) { std::string sloc = name_loc.getName().str(); if (!sloc.empty() && (nodes_blocklist.find(sloc) != nodes_blocklist.end())) { @@ -503,12 +504,13 @@ class QuantizationPattern : public RewritePattern { inputs.reserve(quantizing_op->getNumOperands()); for (auto operand : quantizing_op->getOperands()) { Type operand_type = operand.getType(); - if (operand_type.isa()) { + if (llvm::isa(operand_type)) { inputs.push_back(operand); continue; } - auto ele_type = operand.getType().cast().getElementType(); + auto ele_type = + llvm::cast(operand.getType()).getElementType(); if (static_cast(this) ->AllowDynamicRangeQuantizedOperand(quantizing_op, custom_map)) { @@ -568,13 +570,13 @@ class QuantizationPattern : public RewritePattern { Type result_type = result.getType(); // Add this to the test coverage once we create test ops with none // type results. - if (result_type.isa()) { + if (llvm::isa(result_type)) { outputs_replaced.insert({result, enumerated_result.index()}); output_types.push_back(result_type); continue; } Type result_ele_type = - result.getType().cast().getElementType(); + llvm::cast(result.getType()).getElementType(); // If the user is the QuantizeOp, it must be the only user. 
if (result.hasOneUse() && llvm::isa(*result.user_begin())) { @@ -648,11 +650,9 @@ class QuantizationPattern : public RewritePattern { } for (int i = 0, e = quantized_op->getNumResults(); i < e; ++i) { - if (!quantizing_op->getResult(i) - .getType() - .cast() - .getElementType() - .isa()) { + if (!llvm::isa( + llvm::cast(quantizing_op->getResult(i).getType()) + .getElementType())) { continue; } CreateVerifier(quantizing_op, quantized_op, rewriter, i, @@ -673,9 +673,7 @@ class QuantizationPattern : public RewritePattern { void RewireFloatModelBackbone(Operation* quantized_op, Operation* float_op) const { for (int i = 0, e = quantized_op->getNumResults(); i < e; ++i) { - if (!float_op->getResult(i) - .getType() - .cast() + if (!llvm::cast(float_op->getResult(i).getType()) .getElementType() .isF32()) { continue; @@ -768,14 +766,14 @@ struct ConvertUnsignedToSigned : public OpRewritePattern { auto flags = quant::QuantizationFlags::Signed; QType new_qtype; - if (auto uqtype = qtype.template dyn_cast()) { + if (auto uqtype = llvm::dyn_cast(qtype)) { new_qtype = quant::UniformQuantizedType::getChecked( op.getLoc(), flags, qtype.getStorageType(), qtype.getExpressedType(), uqtype.getScale(), uqtype.getZeroPoint() - offset, uqtype.getStorageTypeMin() - offset, uqtype.getStorageTypeMax() - offset); - } else if (auto aqtype = qtype.template dyn_cast< - quant::UniformQuantizedPerAxisType>()) { + } else if (auto aqtype = + llvm::dyn_cast(qtype)) { auto zero_points = aqtype.getZeroPoints(); llvm::SmallVector new_zero_points(zero_points.begin(), zero_points.end()); diff --git a/tensorflow/compiler/mlir/quantization/common/test_base.h b/tensorflow/compiler/mlir/quantization/common/test_base.h index f33e586c100d..d89b2ac95616 100644 --- a/tensorflow/compiler/mlir/quantization/common/test_base.h +++ b/tensorflow/compiler/mlir/quantization/common/test_base.h @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/quantization/common/func.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -53,7 +54,7 @@ class QuantizationTestBase : public Test { func::FuncDialect, TF::TensorFlowDialect, TFL::TensorFlowLiteDialect, tf_saved_model::TensorFlowSavedModelDialect, tf_executor::TensorFlowExecutorDialect, quant::QuantDialect, - quantfork::QuantizationForkDialect>(); + quantfork::QuantizationForkDialect, ir::TFQuantDialect>(); } // Parses `module_op_str` to create a `ModuleOp`. diff --git a/tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.cc b/tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.cc new file mode 100644 index 000000000000..c19b7680b36c --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.cc @@ -0,0 +1,184 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" + +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/common/tf_uniform_quantized_types.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" + +namespace mlir::tf_quant { + +using ::mlir::stablehlo::DotGeneralOp; + +bool HasStaticShape(Value value) { + auto shaped_type = mlir::dyn_cast(value.getType()); + if (!shaped_type) return false; + + return shaped_type.hasStaticShape(); +} + +bool HasStaticShapeAtDims(Value value, const ArrayRef dims) { + auto shaped_type = mlir::dyn_cast(value.getType()); + if (!shaped_type || !shaped_type.hasRank()) return false; + + for (auto dim : dims) { + if (shaped_type.isDynamicDim(dim)) return false; + } + return true; +} + +Type CloneTypeWithNewElementType(Type old_type, Type element_type) { + if (!mlir::isa(old_type)) return {}; + + return mlir::cast(old_type).clone(element_type); +} + +SmallVector CloneOpWithReplacedOperands( + OpBuilder& builder, Operation* op, const ArrayRef new_operands) { + IRMapping mapping; + for (const auto& arg : enumerate(new_operands)) { + mapping.map(op->getOperand(arg.index()), arg.value()); + } + return builder.clone(*op, mapping)->getResults(); +} + +FailureOr CastI64ToI32(const int64_t value) { + if (!llvm::isInt<32>(value)) { + DEBUG_WITH_TYPE( + "mlir-quant-attrs-and-constraints", + llvm::dbgs() + << "Tried to cast " << value + << "from int64 to int32, but lies out of range of int32.\n"); + return failure(); + } + return static_cast(value); +} + +FailureOr> CastI64ArrayToI32( + const ArrayRef int64_array) { + SmallVector int32_array{}; + int32_array.reserve(int64_array.size()); + + for (const int64_t i64 : int64_array) { + FailureOr cast_i32 = CastI64ToI32(i64); + if (failed(cast_i32)) return failure(); + + int32_array.push_back(*cast_i32); + } + return int32_array; +} + +StringRef GetEntryFunctionName(TF::XlaCallModuleOp op) { + if (!op->hasAttrOfType( + TF::kStablehloEntryFunctionAttrName)) { + return StringRef(); + } + return op + ->getAttrOfType(TF::kStablehloEntryFunctionAttrName) + .getValue(); +} + +bool IsHybridQuantizedOp(Operation* op) { + if 
((op->getNumOperands() != 2 && op->getNumOperands() != 3) || + op->getResultTypes().size() != 1) { + return false; + } + Type lhs_type = op->getOperand(0).getType(); + Type rhs_type = op->getOperand(1).getType(); + Type result_type = op->getResult(0).getType(); + return !IsQuantizedTensorType(lhs_type) && IsQuantizedTensorType(rhs_type) && + !IsQuantizedTensorType(result_type); +} + +absl::StatusOr IsDotGeneralFullyConnected(DotGeneralOp dot_general_op) { + if (dot_general_op == nullptr) + return absl::InvalidArgumentError( + "Given dot_general op cannot be null when checking " + "`IsDotGeneralBatchMatmul`."); + const ::mlir::stablehlo::DotDimensionNumbersAttr dot_dimension_numbers = + dot_general_op.getDotDimensionNumbers(); + const ArrayRef lhs_contracting_dims = + dot_dimension_numbers.getLhsContractingDimensions(); + const ArrayRef rhs_contracting_dims = + dot_dimension_numbers.getRhsContractingDimensions(); + const int64_t input_rank = + mlir::dyn_cast(dot_general_op.getOperand(0).getType()) + .getRank(); + const int64_t filter_rank = + mlir::dyn_cast(dot_general_op.getOperand(1).getType()) + .getRank(); + // The following conditions are such requirements: + // - rank(lhs) is 1 or 2 + // - rank(rhs) = 2 + // - size(lhs_contracting_dimensions) = 1 + // - size(rhs_contracting_dimensions) = 1 + // - lhs_contracting_dimension = last dimension of lhs. + // - `stablehlo.dot_general` should not have `lhs_batching_dim`. + // - quantization_dimension(rhs) should not be in + // `rhs_contracting_dimensions`. + // https://github.com/openxla/stablehlo/blob/main/docs/spec.md#dot_general + const bool has_proper_rank = + (input_rank == 1 || input_rank == 2) && filter_rank == 2; + const bool has_proper_contracting_dim = + lhs_contracting_dims.size() == 1 && rhs_contracting_dims.size() == 1 && + lhs_contracting_dims[0] == input_rank - 1; + const bool is_not_batch_op = + dot_dimension_numbers.getLhsBatchingDimensions().empty(); + const bool has_proper_quantization_dimension = + absl::c_find(rhs_contracting_dims, filter_rank) == + rhs_contracting_dims.end(); + return has_proper_rank && has_proper_contracting_dim && is_not_batch_op && + has_proper_quantization_dimension; +} + +std::optional GetDotGeneralQuantizationDim( + DotGeneralOp dot_general_op) { + if (dot_general_op == nullptr) return std::nullopt; + const int64_t filter_rank = + mlir::dyn_cast(dot_general_op.getOperand(1).getType()) + .getRank(); + + // To quantize rhs per-channel, we currently only consider the case where + // `stablehlo.dot_general` is legalizable to `tfl.fully_connected`. + const bool is_per_axis_quantizable = + IsDotGeneralFullyConnected(dot_general_op).value(); + if (!is_per_axis_quantizable) return std::nullopt; + return filter_rank - 1; +} + +bool ContainsConvOrDot(StringRef str) { + return str.contains("_conv") || str.contains("_dot_general"); +} + +} // namespace mlir::tf_quant diff --git a/tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h b/tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h new file mode 100644 index 000000000000..d542996e522f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h @@ -0,0 +1,260 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_ATTRS_AND_CONSTRAINTS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_ATTRS_AND_CONSTRAINTS_H_ + +#include +#include +#include +#include + +#include "absl/status/statusor.h" +#include "llvm/Support/Debug.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" + +namespace mlir::tf_quant { + +constexpr char kAttrMapAttribute[] = "attr_map"; + +// Name of the string attribute attached to `XlaCallModuleOp`, which is the +// textproto representation of `Method`. +inline constexpr StringRef kQuantizationMethodAttr = "_quantization_method"; + +// Permutation from the NHWC tensor format to NCHW. This is an inverse +// permutation of `kNchwToNhwcPermutation`. +inline constexpr std::array kNhwcToNchwPermutation = {0, 3, 1, 2}; + +// Permutation from the NCHW tensor format to NHWC. This is an inverse +// permutation of `kNchwToNhwcPermutation`. +inline constexpr std::array kNchwToNhwcPermutation = {0, 2, 3, 1}; + +// Permutation from the OIHW (== (output features, input features, height, +// width)) tensor format to HWIO. This is commonly used to transpose convolution +// weights represented as OIHW format to HWIO, which is more desirable for +// certain downstream optimization passes (e.g. XLA). +inline constexpr std::array kOihwToHwioPermutation = {2, 3, 1, 0}; + +// Returns true if the value has static shape. +bool HasStaticShape(Value value); + +// Returns true if the value has static shape at given dims. +bool HasStaticShapeAtDims(Value value, ArrayRef dims); + +// Whether `value` has known rank of `rank`. Returns false when it is not a +// `ShapedType` or its rank is unknown. +inline bool HasRankOf(Value value, const int64_t rank) { + auto shaped_type = mlir::dyn_cast_or_null(value.getType()); + return shaped_type && shaped_type.hasRank() && shaped_type.getRank() == rank; +} + +// Creates a new type that has the shape from the `old_type` and the element +// type from the `element_type`. +Type CloneTypeWithNewElementType(Type old_type, Type element_type); + +// Creates an array with integer/float type. 
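+// For example (illustrative only):
+//   Value v = CreateConstValue<int32_t>(builder, loc, /*shape=*/{2}, {1, 2});
+// materializes a rank-1 i32 constant holding [1, 2]; the floating-point
+// overload uses an f32 element type instead.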
+template || std::is_same_v), void>> +Value CreateConstValue(OpBuilder& builder, const Location loc, + const SmallVector& shape, + const SmallVector& values) { + if constexpr (std::is_integral_v) { + auto shape_type = + RankedTensorType::get(shape, builder.getIntegerType(sizeof(T) * 8)); + + const auto attr = DenseIntElementsAttr::get(shape_type, values); + return builder.create(loc, attr); + } + + const auto type = RankedTensorType::get(shape, builder.getF32Type()); + const auto value_attr = DenseFPElementsAttr::get(type, values); + return builder.create(loc, value_attr); +} + +// Creates a 1D array with integer/float type. +template +Value Create1DConstValue(OpBuilder& builder, const Location loc, + const SmallVector& values) { + return CreateConstValue(builder, loc, + {static_cast(values.size())}, values); +} + +// Creates a scalar with integer / float type. +template +Value CreateScalarConstValue(OpBuilder& builder, const Location loc, + const T value) { + return CreateConstValue(builder, loc, /*shape=*/{}, {value}); +} + +// Checks if the value is a constant and return its splat value. +template || std::is_same_v), void>> +bool GetSplatValue(Value value, T& splat_value) { + if constexpr (std::is_integral_v) { + DenseIntElementsAttr value_attr; + if (!matchPattern(value, m_Constant(&value_attr)) || + !value_attr.isSplat()) { + return false; + } + splat_value = value_attr.getSplatValue(); + return true; + } + + DenseFPElementsAttr value_attr; + if (!matchPattern(value, m_Constant(&value_attr)) || !value_attr.isSplat()) { + return false; + } + splat_value = value_attr.getSplatValue(); + return true; +} + +// Checks if the value is a constant and its splat value is equal to x. +template +bool IsSplatValueEqual(Value value, const T x) { + T splat_value; + if (!GetSplatValue(value, splat_value)) return false; + + return splat_value == x; +} + +// Checks if two values are constants and their splat values are equal. +template +bool AreSplatValuesEqual(Value x, Value y) { + T splat_x, splat_y; + if (!GetSplatValue(x, splat_x) || !GetSplatValue(y, splat_y)) { + return false; + } + + return splat_x == splat_y; +} + +// Clones an operation with new operands while keeping attributes. +SmallVector CloneOpWithReplacedOperands(OpBuilder& builder, + Operation* op, + ArrayRef new_operands); + +// Tries casting `op` with a concrete op type `T`. If the cast fails or `op` is +// a `nullptr`, returns `failure` and prints a debugging message identifying +// the cast attempt as `name`. +template +FailureOr TryCast(Operation* op, const StringRef name) { + auto cast_op = dyn_cast_or_null(op); + if (cast_op) { + return cast_op; + } else { + DEBUG_WITH_TYPE("mlir-quant-attrs-and-constraints", + llvm::dbgs() << "Failed to match " << name << " (" + << T::getOperationName() << ").\n"); + return failure(); + } +} + +FailureOr CastI64ToI32(int64_t value); + +// Tries to cast an array of int64 to int32. If any of the element in the +// array is not in the range of int32, returns failure(). +FailureOr> CastI64ArrayToI32( + ArrayRef int64_array); + +// Returns the first operation with the given type in the function. +template +OpType FindOperationOfType(func::FuncOp function) { + for (auto op : function.getBody().getOps()) { + return op; + } + return nullptr; +} + +// Returns the first user of the given operation, optionally of the given +// type if provided. If there is no user or user of type, return nullptr. 
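+// For example (illustrative only): FindUserOfType<TF::CastOp>(op) walks
+// op->getUsers() and returns the first user that is a TF::CastOp, or nullptr
+// if no such user exists.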
+template +Operation* FindUserOfType(Operation* op) { + for (Operation* user : op->getUsers()) { + if (isa(user)) { + return user; + } + } + return nullptr; +} + +// Returns the first user of the given operation, optionally of the given +// type if provided. If there is no user or user of type, return nullptr. +template +Operation* FindOperandOfType(Operation* op) { + for (Value operand_value : op->getOperands()) { + if (isa(operand_value.getDefiningOp())) { + return operand_value.getDefiningOp(); + } + } + return nullptr; +} + +// Returns the function attribute for the given call op which is lifted for +// quantization. +inline FlatSymbolRefAttr GetFuncAttr(TF::PartitionedCallOp call_op) { + return mlir::dyn_cast(call_op.getFAttr()); +} + +inline FlatSymbolRefAttr GetFuncAttr(TF::XlaCallModuleOp call_op) { + return call_op->getAttrOfType( + TF::kStablehloEntryFunctionAttrName); +} + +// Returns the entry function name for the given tf.XlaCallModule op. Returns +// empty string if such attribute does not exist. +StringRef GetEntryFunctionName(TF::XlaCallModuleOp op); + +// Checks whether the given op contains QuantizationTrait::FullyQuantizable. +inline bool HasQuantizableTrait(Operation* op) { + return op->hasAttrOfType(kQuantTraitAttrName) && + op->getAttrOfType(kQuantTraitAttrName).getValue().str() == + QuantTraitValues[QuantizationTrait::FullyQuantizable]; +} + +// Returns true if `op` has two operands and one result and only second operand +// is quantized. +bool IsHybridQuantizedOp(Operation* op); + +// Returns whether a given `stablehlo.dot_general` can be legalizable to +// `tfl.fully_connected`. +absl::StatusOr IsDotGeneralFullyConnected( + ::mlir::stablehlo::DotGeneralOp dot_general_op); + +// Returns the quantization dimension for a given `stablehlo.dot_general` op, +// or `std::nullopt` if the given op is not per-channel quantizable. +std::optional GetDotGeneralQuantizationDim( + ::mlir::stablehlo::DotGeneralOp dot_general_op); + +// Checks if a `StringRef` contains 'conv' or 'dot_general'. +bool ContainsConvOrDot(StringRef str); + +} // namespace mlir::tf_quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_ATTRS_AND_CONSTRAINTS_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.cc b/tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.cc new file mode 100644 index 000000000000..602e077d095f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.cc @@ -0,0 +1,550 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/base/nullability.h" +#include "absl/container/flat_hash_set.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/ErrorHandling.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeRange.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/ValueRange.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/Version.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" +#include "tensorflow/core/ir/types/dialect.h" +#include "tensorflow/core/platform/mutex.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep + +namespace mlir::tf_quant { + +using ::stablehlo::quantization::Method; +using ::tsl::protobuf::TextFormat; + +// Default version number for native serialization. +constexpr int64_t kDefaultVersion = 9; +// Default platform for XlaCallModuleOp. +constexpr StringRef kPlatformCpu = "CPU"; +// Name of `tf.XlaCallModule`'s dictionary attribute for keeping the +// deserialized stablehlo module's attributes. +constexpr StringRef kStablehloModuleAttrsAttrName = "_stablehlo_module_attrs"; +// Attribute required for running shape refinement pass enabled in XlaCallModule +// version 8 and above. +constexpr StringRef kUsesShapePolymorphismAttr = "jax.uses_shape_polymorphism"; + +bool IsInLiftedFunc(Operation* op) { + if (op == nullptr) return false; + return op->getParentOfType()->hasAttr(kFusedFunctionAttr); +} + +bool IsInStableHloOpRegion(Operation* op) { + if (op == nullptr) return false; + auto parent_op = op->getParentOp(); + return parent_op != nullptr && quant::stablehlo::IsStablehloOp(parent_op); +} + +// Inserts the function to the symbol table of the module thread-safely. 
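+// If `func_name` already collides with an existing symbol, a numeric suffix
+// ("_1", "_2", ...) is appended until the name is unique, and the resulting
+// symbol name is returned.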
+StringAttr InsertToSymbolTable(Operation& module, Operation& function, + const StringRef func_name) { + static tensorflow::mutex* mtx = new tensorflow::mutex(); + tensorflow::mutex_lock lock(*mtx); + + SymbolTable symbol_table(&module); + std::string unique_name = func_name.str(); + int32_t uniquing_counter = 0; + while (symbol_table.lookup(unique_name) != nullptr) { + ++uniquing_counter; + unique_name = absl::StrCat(func_name.str(), "_", uniquing_counter); + } + function.setAttr("sym_name", + StringAttr::get(module.getContext(), unique_name)); + return symbol_table.insert(&function); +} + +// Creates the TF::PartitionedCallOp with the given arguments and output types. +// This function call op is for invoking the TF subgraphs. +ValueRange CreateTFPartitionedCallOp(OpBuilder& builder, + const Location location, + const StringRef func_name, + const TypeRange output_types, + const ValueRange args) { + TF::PartitionedCallOp call_op = builder.create( + location, output_types, args, + /*args_attrs=*/nullptr, /*res_attrs=*/nullptr, + FlatSymbolRefAttr::get(builder.getStringAttr(func_name)), + /*config=*/"", /*config_proto=*/"", /*executor_type=*/""); + + // Set the attribute to annotate this function call op as a quantizable spot. + call_op->setAttr( + kQuantTraitAttrName, + builder.getStringAttr(StringRef( + std::string(QuantTraitValues[QuantizationTrait::FullyQuantizable])))); + + return call_op.getOutput(); +} + +// Creates the TF::XlaCallModuleOp with the given arguments and output types. +// This function call op is for invoking the StableHLO subgraphs. +ValueRange CreateTFXlaCallModuleOp(OpBuilder& builder, const Location location, + const StringRef func_name, + const TypeRange output_types, + const ValueRange args) { + MLIRContext* ctx = builder.getContext(); + // Collect the shapes of the output to fill up the Sout attribute. + SmallVector shape_attrs; + for (const Type result_type : output_types) { + shape_attrs.push_back( + tf_type::ShapeAttr::get(ctx, mlir::cast(result_type))); + } + auto empty_array_attr = ArrayAttr::get(ctx, {}); + auto platforms = ArrayAttr::get(ctx, {StringAttr::get(ctx, kPlatformCpu)}); + + auto call_op = builder.create( + location, + /*output=*/output_types, + /*args=*/args, + /*version=*/kDefaultVersion, /*module=*/"", + /*Sout=*/ArrayAttr::get(ctx, shape_attrs), + /*dim_args_spec=*/empty_array_attr, + /*platforms=*/platforms, + /*function_list=*/empty_array_attr, + /*has_token_input_output=*/false, + /*disabled_checks=*/empty_array_attr); + + // Set the function name. This will be controlled by the + // XlaCallModuleSerialization related passes directly, which means that the + // function name can be changed by those passes. + call_op->setAttr(TF::kStablehloEntryFunctionAttrName, + FlatSymbolRefAttr::get(builder.getStringAttr(func_name))); + + // Set target version to WEEK_4 since this is an offline quantizer. + std::string target_version = + mlir::vhlo::Version::fromCompatibilityRequirement( + vhlo::Version::CompatibilityRequirement::WEEK_4) + .toString(); + call_op->setAttr(TF::kStablehloVersionAttrName, + builder.getStringAttr(target_version)); + + // Store the custom attribute to restore the function name when loading it + // back in the post calibration stage. As mentioned above, the above entry + // function attribute is not reliable. + call_op->setAttr(kOriginalStablehloEntryFunctionAttrName, + builder.getStringAttr(func_name)); + + // Set the attribute to annotate this function call op as a quantizable spot. 
+ call_op->setAttr( + kQuantTraitAttrName, + builder.getStringAttr(StringRef( + std::string(QuantTraitValues[QuantizationTrait::FullyQuantizable])))); + + // Set jax.uses_shape_polymorphism=true to enable shape refinement at runtime. + // This is needed for native serialization version >= 8. + call_op->setAttr(kStablehloModuleAttrsAttrName, + builder.getDictionaryAttr(builder.getNamedAttr( + kUsesShapePolymorphismAttr, builder.getBoolAttr(true)))); + + return call_op.getOutput(); +} + +// Creates the function call op based on the given call_op_type argument. +ValueRange CreateFunctionCallOp(OpBuilder& builder, const Location location, + const FunctionCallOpType call_op_type, + const StringRef func_name, + const TypeRange output_types, + const ValueRange args) { + switch (call_op_type) { + case FunctionCallOpType::TFXlaCallModuleOp: + return CreateTFXlaCallModuleOp(builder, location, func_name, output_types, + args); + case FunctionCallOpType::TFPartitionedCallOp: + return CreateTFPartitionedCallOp(builder, location, func_name, + output_types, args); + } +} + +// Finds ops in the paths from arguments to results. The ops is listed in an +// order that the former ops shouldn't have any dependencies on the later ones. +SmallVector FindOpsFromArgumentsToResults( + const ArrayRef arguments, const ArrayRef results) { + std::queue value_queue; + for (Value result : results) { + value_queue.push(result); + } + absl::flat_hash_set argument_set; + for (Value argument : arguments) { + argument_set.insert(argument.getImpl()); + } + + // Searching for ops from results to arguments. Duplicate ops in the op stack + // are intentional in order to make sure the op on the top of the stack + // doesn't depends on any ops below it. + std::stack op_stack; + while (!value_queue.empty()) { + Value current_value = value_queue.front(); + value_queue.pop(); + + Operation* defining_node = current_value.getDefiningOp(); + if (defining_node == nullptr) continue; + op_stack.push(defining_node); + for (Value arg : defining_node->getOperands()) { + if (!argument_set.contains(arg.getImpl())) { + value_queue.push(arg); + } + } + } + + // Remove duplicate ops from the op stack. + SmallVector sorted_ops; + absl::flat_hash_set unique_ops; + while (!op_stack.empty()) { + Operation* current_op = op_stack.top(); + op_stack.pop(); + if (unique_ops.contains(current_op)) continue; + sorted_ops.push_back(current_op); + unique_ops.insert(current_op); + } + return sorted_ops; +} + +// Finds the name of each attribute in `attributes` and set the attr_map +// attribute which maps an attribute identifier to its attribute name. The +// identifier is the order of that attribute in `attributes`. This map +// is then used to set attributes in the quantized functions in the +// QuantizeCompositeFunctionsPass. +// For example, for tf.MatMul with `attributes` = {{"transpose_a", false}, +// {"transpose_b", false}}, the generated attr_map is +// "0:transpose_a,1:transpose_b", where 0 and 1 are the respective attribute +// identifiers. +// This function returns success if all attributes could be found. +LogicalResult SetAttributeMap(MLIRContext& context, + const ArrayRef attributes, + const ArrayRef ops) { + // A map to find which operation an attribute belongs to. + // The key for this map uses the entire NamedAttribute object, i.e. the + // {attribute_name, attribute_value} pair. 
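+  // Keying on the full pair (rather than the name alone) keeps attributes
+  // that share a name but carry different values on different ops as
+  // distinct entries.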
+ llvm::SmallDenseMap attr_to_op_map; + for (Operation* op : ops) { + for (const NamedAttribute named_attr : op->getAttrs()) { + attr_to_op_map.insert({named_attr, op}); + } + } + + for (int idx : llvm::seq(0, attributes.size())) { + const NamedAttribute& attribute = attributes[idx]; + // Skip the following steps if the attribute value is `NullAttribute`. + if (const auto string_attr = + mlir::dyn_cast_or_null(attribute.getValue()); + string_attr != nullptr && + string_attr.getValue() == kNullAttributeValue) { + continue; + } + + if (std::find_if( + attr_to_op_map.begin(), attr_to_op_map.end(), [&](auto attr_op) { + return std::get<0>(attr_op).getName() == attribute.getName(); + }) == attr_to_op_map.end()) { + emitError(UnknownLoc::get(&context), + "Could not find attribute: " + attribute.getName().str()); + return failure(); + } + + Operation* owner_op; + for (const auto& [attr, val] : attr_to_op_map) { + if (attr.getName() == attribute.getName()) owner_op = val; + } + if (quant::stablehlo::IsStablehloOp(owner_op)) { + owner_op->setAttr(StringRef(attribute.getName()), attribute.getValue()); + } else { + owner_op = attr_to_op_map[attribute]; + + std::string new_attr_map_str{}; + if (owner_op->hasAttr(kAttrMapAttribute)) { + new_attr_map_str = + owner_op->getAttrOfType(kAttrMapAttribute).str(); + absl::StrAppend(&new_attr_map_str, ","); + } + + // Append ":". Ex) "0:transpose_a". + const std::string identifier = std::to_string(idx); + const StringAttr attribute_name = attribute.getName(); + absl::StrAppend(&new_attr_map_str, identifier, ":", attribute_name.str()); + owner_op->setAttr(kAttrMapAttribute, + StringAttr::get(&context, new_attr_map_str)); + } + } + return success(); +} + +// Creates a function to wrap the section between arguments and results. +SmallVector LiftAsFunctionCall( + OpBuilder& builder, const Location location, + const FunctionCallOpType call_op_type, const StringRef func_name, + const ArrayRef arguments, const ArrayRef results, + const ArrayRef attributes) { + MLIRContext* context = builder.getContext(); + if (results.empty()) { + emitError(UnknownLoc::get(context), "No result values specified"); + return {}; + } + Operation* result_op = results[0].getDefiningOp(); + auto module = result_op->getParentOfType(); + + // Create a private function and copy all ops between arguments and results. + auto current_func = result_op->getParentOfType(); + auto guard = OpBuilder::InsertionGuard(builder); + builder.setInsertionPointAfter(current_func); + TypeRange arg_types{ValueRange{arguments}}; + TypeRange result_types{ValueRange{results}}; + auto func_type = FunctionType::get(context, arg_types, result_types); + + SmallVector arg_locs; + for (Value arg : arguments) { + arg_locs.push_back(arg.getLoc()); + } + + auto wrap_func = builder.create(location, func_name, func_type); + wrap_func.setVisibility(SymbolTable::Visibility::Private); + // The callee function for TF::XlaCallModuleOp must have this attribute. + if (call_op_type == FunctionCallOpType::TFXlaCallModuleOp) { + wrap_func->setAttr(TF::kFromXlaCallModuleAttrName, builder.getUnitAttr()); + } + wrap_func->setAttr(kFusedFunctionAttr, builder.getUnitAttr()); + builder.createBlock(&wrap_func.getBody(), wrap_func.begin(), arg_types, + arg_locs); + + IRMapping mapping; + for (int32_t i : llvm::seq(0, arguments.size())) { + mapping.map(arguments[i], wrap_func.getArgument(i)); + } + + auto cloning_ops = FindOpsFromArgumentsToResults(arguments, results); + // Set the location of call op to QuantizationUnitLoc if found. 
+ Location call_op_loc = location; + for (Operation* op : cloning_ops) { + std::optional unit = + quant::FindQuantizationUnitFromLoc(op->getLoc()); + if (unit.has_value()) { + call_op_loc = + quant::QuantizationUnitLoc(builder.getContext(), unit.value()); + } + } + + if (failed(SetAttributeMap(*context, attributes, cloning_ops))) { + current_func.emitError() << "Some attributes couldn't be found."; + } + for (Operation* op : cloning_ops) { + builder.clone(*op, mapping); + } + + SmallVector return_values; + for (Value result : results) { + return_values.push_back(mapping.lookupOrNull(result)); + } + builder.create(location, return_values); + + // Create a function call to the newly created function. + StringAttr new_func_name = + InsertToSymbolTable(*module, *wrap_func, func_name); + builder.setInsertionPointAfter(result_op); + ValueRange new_results = + CreateFunctionCallOp(builder, call_op_loc, call_op_type, + new_func_name.getValue(), result_types, arguments); + return SmallVector(new_results.begin(), new_results.end()); +} + +SmallVector LiftAsFunctionCall(OpBuilder& builder, + const Location location, + const FunctionCallOpType call_op_type, + const StringRef func_name, + const ArrayRef arguments, + const ArrayRef results) { + SmallVector attributes; + return LiftAsFunctionCall(builder, location, call_op_type, func_name, + arguments, results, attributes); +} + +SmallVector AppendToVector(const ArrayRef arguments, + Value append) { + SmallVector ret(arguments); + ret.push_back(append); + return ret; +} + +// Check if the given einsum equation is supported by XlaDotV2. +// Conditions: +// 1. Two inputs & one output. +// 2. No ... in the equation. +// 3. Batch dimensions should be the same, or only the left equation should have +// the batch dimension. This condition is from the XlaDotV2 specification. It +// could process the following equation by setting the attributes properly: +// abc,cd->abd. +// 4. The output should be in the form: [batch dims][lhs dims][rhs dims] +bool IsEinsumSupportedByXlaDotV2(StringAttr equation_attr) { + StringRef equation = equation_attr.getValue(); + + if (!absl::StrContains(equation, "->") || !absl::StrContains(equation, ",") || + absl::StrContains(equation, ".")) { + return false; + } + + // Parse equation. + int idx_arrow = equation.find("->"); + StringRef calc_eq = equation.substr(0, idx_arrow); + StringRef out_eq = equation.substr(idx_arrow + 2); + + int idx_comma = calc_eq.find(','); + StringRef lhs_eq = calc_eq.substr(0, idx_comma); + StringRef rhs_eq = calc_eq.substr(idx_comma + 1); + + if (absl::StrContains(rhs_eq, ",")) return false; + + int lhs_out_idx_start = out_eq.size(); + int lhs_out_idx_end = -1; + int rhs_out_idx_start = out_eq.size(); + int rhs_out_idx_end = -1; + int lhs_batch_dim_size = 0; + int rhs_batch_dim_size = 0; + for (const char c : lhs_eq) { + if (absl::StrContains(out_eq, c) && absl::StrContains(rhs_eq, c)) { + lhs_batch_dim_size++; + } else if (absl::StrContains(out_eq, c)) { + const int out_idx = out_eq.find(c); + if (out_idx < lhs_out_idx_end) { + // Left-hand equation is reversed in the output. 
+ return false; + } + lhs_out_idx_start = std::min(lhs_out_idx_start, out_idx); + lhs_out_idx_end = std::max(lhs_out_idx_end, out_idx); + } + } + + for (const char c : rhs_eq) { + if (absl::StrContains(out_eq, c) && absl::StrContains(lhs_eq, c)) { + rhs_batch_dim_size++; + } else if (absl::StrContains(out_eq, c)) { + int out_idx = out_eq.find(c); + if (out_idx < rhs_out_idx_end) { + return false; + } + if (out_idx < rhs_out_idx_start) rhs_out_idx_start = out_idx; + if (out_idx > rhs_out_idx_end) rhs_out_idx_end = out_idx; + } + } + + if (lhs_batch_dim_size != rhs_batch_dim_size && lhs_batch_dim_size != 0 && + rhs_batch_dim_size != 0) { + // Batch dimension does not match. + return false; + } + + // All the lhs equations should come first. + if (lhs_out_idx_end > rhs_out_idx_start) return false; + + // All the lhs out dim and rhs out dim should be larger than the batch dims, + // and they should not be mixed. + int batch_dim_size = std::max(rhs_batch_dim_size, lhs_batch_dim_size); + return lhs_out_idx_start >= batch_dim_size && + rhs_out_idx_start >= batch_dim_size; +} + +absl::StatusOr GetQuantizationMethod(Operation* absl_nonnull op) { + const auto quantization_method_attr = + op->getAttrOfType(kQuantizationMethodAttr); + if (!quantization_method_attr) { + return absl::InvalidArgumentError(absl::StrCat( + "Attribute ", kQuantizationMethodAttr.str(), " is not found.")); + } + + Method quantization_method; + const std::string method_txtpb = quantization_method_attr.getValue().str(); + if (!TextFormat::ParseFromString(method_txtpb, &quantization_method)) { + return absl::InternalError( + absl::StrCat("Failed to parse Method from textproto: ", method_txtpb)); + } + + return quantization_method; +} + +Method GetQuantizationMethodOrDefault(Operation* absl_nonnull op) { + absl::StatusOr method = GetQuantizationMethod(op); + if (method.status().code() == absl::StatusCode::kInternal) { + // This indicates that the `Method` protobuf string is corrupt, but this + // function ignores it and returns the default instance. + op->emitError(absl::StrCat("Failed to get quantization method: ", + method.status().ToString())); + } + return method.ok() ? *method : Method::default_instance(); +} + +bool HasWeightOnlyPtqMethod(TF::XlaCallModuleOp xla_call_module_op) { + Method method = GetQuantizationMethodOrDefault(xla_call_module_op); + return method.has_weight_only_ptq(); +} + +bool IsWeightOnlyQuantizableOp(const Operation& op) { + if (auto call_op = dyn_cast(op)) { + StringRef entry_function_name = GetEntryFunctionName(call_op); + absl::StatusOr quantization_method = GetQuantizationMethod(call_op); + return ContainsConvOrDot(entry_function_name) && quantization_method.ok() && + quantization_method->has_weight_only_ptq(); + } + return false; +} + +SmallVector GetSortedFunctions(ModuleOp module_op) { + auto iterator_range = module_op.getOps(); + SmallVector func_ops(iterator_range.begin(), + iterator_range.end()); + absl::c_sort(func_ops, [](func::FuncOp op1, func::FuncOp op2) { + return op1.getName() < op2.getName(); + }); + return func_ops; +} + +} // namespace mlir::tf_quant diff --git a/tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h b/tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h new file mode 100644 index 000000000000..b421ec3c672d --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h @@ -0,0 +1,114 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
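A minimal usage sketch of IsEinsumSupportedByXlaDotV2, which is defined above and declared in tf_lift_as_function_call.h below. The wrapper function here is hypothetical and not part of the patch; the equations illustrate the four support conditions listed in the comment, and "abc,cd->abd" is the example the comment itself gives.

#include <cassert>

#include "mlir/IR/Builders.h"
#include "mlir/IR/MLIRContext.h"
#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h"

// Illustrative only: exercises the einsum support conditions listed above.
void EinsumSupportExamples() {
  mlir::MLIRContext context;
  mlir::Builder builder(&context);

  // Only the left operand carries a batch dimension; this is the example
  // from the comment above and is accepted.
  assert(mlir::tf_quant::IsEinsumSupportedByXlaDotV2(
      builder.getStringAttr("abc,cd->abd")));

  // Ellipses are rejected (condition 2), and so is an output that does not
  // keep the [batch dims][lhs dims][rhs dims] ordering (condition 4).
  assert(!mlir::tf_quant::IsEinsumSupportedByXlaDotV2(
      builder.getStringAttr("...ab,bc->...ac")));
  assert(!mlir::tf_quant::IsEinsumSupportedByXlaDotV2(
      builder.getStringAttr("ab,bc->ca")));
}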
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_LIFT_AS_FUNCTION_CALL_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_LIFT_AS_FUNCTION_CALL_H_ + +#include "absl/base/nullability.h" +#include "absl/status/statusor.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir::tf_quant { + +// This attribute will be set for functions created by this pass. +// Presence of this attribute will mark the function as quantization target. +inline constexpr StringRef kFusedFunctionAttr = "tf_quant.composite_function"; +// The keyword to detect if this is a `NullAttribute`. +inline constexpr StringRef kNullAttributeValue = "N/A"; + +// Prefixes attached to lifted functions. +constexpr StringRef kQuantizedFuncPrefix = "quantized_"; +constexpr StringRef kCompositeFuncPrefix = "composite_"; + +// The attribute will be used for TF::XlaCallModuleOp to restore the original +// function name when loading it back. +inline constexpr StringRef kOriginalStablehloEntryFunctionAttrName = + "_original_entry_function"; + +// FunctionCallOpType to be generated as the function call operator when +// function lifting will happen. +enum FunctionCallOpType { TFPartitionedCallOp = 0, TFXlaCallModuleOp = 1 }; + +// Checks if an op is inside a lifted function. +// If the given op pointer is a nullptr, returns false. +bool IsInLiftedFunc(Operation* op); + +// Checks if the op is inside a StableHLO op with region. +// If the given op pointer is a nullptr, returns false. +bool IsInStableHloOpRegion(Operation* op); + +// Checks if a given einsum op is supported for XlaDotV2 quantization. +bool IsEinsumSupportedByXlaDotV2(StringAttr equation_attr); + +// Gets the quantization method from `op`. It is retrieved from the +// `kQuantizationMethodAttr` string attribute. Returns +// `absl::InvalidArgumentError` when the attribute doesn't exist. Returns +// `absl::InternalError` when parsing the attribute to `Method` failed. +// `op` must be non-null. +absl::StatusOr<::stablehlo::quantization::Method> GetQuantizationMethod( + Operation* absl_nonnull op); + +// Gets the quantization method from `op`. It is retrieved from the +// `kQuantizationMethodAttr` string attribute. Returns a default instance of +// `Method` iff the attribute doesn't exist or the attribute contains an invalid +// textproto for `Method`. `op` must be non-null. 
+::stablehlo::quantization::Method GetQuantizationMethodOrDefault( + Operation* absl_nonnull op); + +// Creates a function to wrap the section between arguments and results. +// The generated function call op type will be decided by the given call_op_type +// argument. Currently, it supports TF::XlaCallModuleOp and +// TF::PartitionedCallOp function call op generations. +SmallVector LiftAsFunctionCall(OpBuilder& builder, Location location, + FunctionCallOpType call_op_type, + StringRef func_name, + ArrayRef arguments, + ArrayRef results, + ArrayRef attributes); + +// Same as above but with empty attributes. +SmallVector LiftAsFunctionCall(OpBuilder& builder, Location location, + FunctionCallOpType call_op_type, + StringRef func_name, + ArrayRef arguments, + ArrayRef results); + +// Add the second argument to the first argument, which is expected to be an +// argument list. +// Used to attach bias to einsum argument list. +SmallVector AppendToVector(ArrayRef arguments, Value append); + +// Checks if the `Method` attatched to the given `tf.XlaCallModule` op has +// `WeightOnlyPtq`. +bool HasWeightOnlyPtqMethod(TF::XlaCallModuleOp xla_call_module_op); + +// Checks if an op is a `tf.XlaCallModule` op, contains 'conv' or 'dot_general' +// in its name and has `Method` with `WeightOnlyPtq`. +bool IsWeightOnlyQuantizableOp(const Operation& op); + +// Lists the functions in a ModuleOp sorted by their names. +SmallVector GetSortedFunctions(ModuleOp module_op); + +} // namespace mlir::tf_quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_LIFT_AS_FUNCTION_CALL_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/BUILD b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/BUILD new file mode 100644 index 000000000000..2ce3b743dcd7 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/BUILD @@ -0,0 +1,125 @@ +load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + # By default, these targets should only be used within the quantization library. 
+ default_visibility = [ + "//learning/brain/mlir/quantization:__subpackages__", + "//platforms/darwinn/compiler:__subpackages__", + "//tensorflow:__subpackages__", + ], + licenses = ["notice"], +) + +cc_library( + name = "tf_quantization_lib", + srcs = [ + "tf_quantization_driver.cc", + "tf_quantization_interface.cc.inc", + "tf_quantization_utils.cc", + ], + hdrs = [ + "tf_quantization_driver.h", + "tf_quantization_interface.h.inc", + "tf_quantization_traits.h", + "tf_quantization_utils.h", + ], + deps = [ + ":tf_quantization_config", + ":tf_quantization_interfaces_inc_gen", + "//tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy:portable_tensor_utils", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", + "//tensorflow/compiler/mlir/tools/optimize:quantization_utils", + "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + ], +) + +tf_cc_test( + name = "tf_quantization_driver_test", + srcs = ["tf_quantization_driver_test.cc"], + deps = [ + ":tf_quantization_lib", + "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:func", + "//tensorflow/compiler/mlir/quantization/common:test_base", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + ], +) + +cc_library( + name = "tf_quantization_config", + srcs = [ + "tf_quantization_config.cc", + ], + hdrs = [ + "tf_quantization_config.h", + ], + deps = [ + "//tensorflow/compiler/mlir/lite/tools/optimize:reduced_precision_metadata", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + ], +) + +td_library( + name = "tf_quantization_td_files", + srcs = [ + "tf_quantization.td", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/common/ir:QuantizationOpsTdFiles", + "@llvm-project//mlir:OpBaseTdFiles", + ], +) + +gentbl_cc_library( + name = "tf_quantization_interfaces_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = [ + ( + ["-gen-op-interface-decls"], + "tf_quantization_interface.h.inc", + ), + ( + ["-gen-op-interface-defs"], + "tf_quantization_interface.cc.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "tf_quantization.td", + deps = [ + ":tf_quantization_td_files", + ], +) + +exports_files([ + "tf_quantization_traits.h", + "tf_quantization_config.h", + "tf_quantization_utils.h", +]) diff --git a/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization.td b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization.td new file mode 100644 index 000000000000..3909495ef239 --- /dev/null +++ 
b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization.td @@ -0,0 +1,223 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the quantization definition file for TensorFlow. + +#ifdef TF_Quantization +#else +#define TF_Quantization + +include "mlir/IR/OpBase.td" +include "mlir/Dialect/Quant/IR/QuantBase.td" + +//===----------------------------------------------------------------------===// +// TFQuantizedType definitions. +//===----------------------------------------------------------------------===// + +// The base class of a quantized type. Signed quantized types may be expressed +// as signless integers (i.e. up to op interpretation), but we include an +// explicit signedness check to differentiate the signed/unsigned constraints +// predicates from one another at the TD level. +class TFQuantizedType params, bit signed> + : Type($_self)">, + CPred<"llvm::cast($_self)" # + ".getStorageTypeIntegralWidth() == " # !head(params)>, + Or<[CPred<"llvm::cast($_self)" # + ".getStorageType().isSignlessInteger()">, + CPred<"llvm::cast($_self)" # + ".getStorageType().isSignedInteger() == " # signed>]>]>, + "Q" # !if (signed, "I", "UI") # !head(params) # " type"> { + string name = n; + string asTraitArgsStr = + !interleave(params, ", ") # !if(signed, ", true", ", false"); +} + +// Uniform quantized types. Two integers "smantissa" and "sexp" are used to +// express the Mantissa and Exponent components of the floating-point scale so +// the scale of the quantized type is "smantissa * 10 ^ sexp". +class UInt8UniformTFQuantizedType + : TFQuantizedType<"Uniform", + [8, zero_pt, smantissa, sexp, 0, 255], 0>; +class Int8UniformTFQuantizedType + : TFQuantizedType<"Uniform", + [8, zero_pt, smantissa, sexp, -128, 127], 1>; + +// General uniform quantized types. The definitions can be used to specify +// operand's tensor types. +def QI4 : TFQuantizedType<"Uniform", [4], 1>; +def QUI8 : TFQuantizedType<"Uniform", [8], 0>; +def QI8 : TFQuantizedType<"Uniform", [8], 1>; +def QUI16 : TFQuantizedType<"Uniform", [16], 0>; +def QI16 : TFQuantizedType<"Uniform", [16], 1>; +def QUI32 : TFQuantizedType<"Uniform", [32], 0>; +def QI32 : TFQuantizedType<"Uniform", [32], 1>; + +//===----------------------------------------------------------------------===// +// TFL native op traits (for quantization). +// +// Ops in this link should have those traits specified: +// https://www.tensorflow.org/lite/performance/quantization_spec +//===----------------------------------------------------------------------===// + +def FixedOutputRangeInterface : OpInterface< + "FixedOutputRangeInterface"> { + let cppNamespace = "tf_quant"; + let description = [{ + Interface for defining the fixed output range. 
+ }]; + + let methods = [ + InterfaceMethod< + [{Returns the fixed output range.}], + "UniformQuantizedType", "GetFixedOutputRange", + (ins "bool":$sign, "int":$bit_width) + >, + ]; +} + +def AffineQuantizedOpInterface : OpInterface< + "AffineQuantizedOpInterface"> { + let cppNamespace = "tf_quant"; + let description = [{ + Interface for affine quantized ops (conv2d, fully_connected, etc.) + }]; + + let methods = [ + InterfaceMethod< + [{Returns the affine operand index.}], + "int", "GetAffineOperandIndex", + (ins), [{}], [{return 1;}]>, + InterfaceMethod< + [{Returns whether narrow range is required for the affine operand.}], + "bool", "RequiredNarrowRangeAffineOperand", + (ins), [{}], [{return true;}]>, + InterfaceMethod< + [{Returns quantization dim for the affine operand.}], + "int", "GetQuantizationDimIndex", + (ins)>, + InterfaceMethod< + [{Returns the dimension index of the output channels.}], + "int", "GetChannelDimIndex", (ins) + >, + ]; +} + +def SameOperandsAndResultsScale : OpInterface<"SameScalesOpInterface"> { + let cppNamespace = "tf_quant"; + let description = [{ + Interface for ops potentially have same operands and results scales. + }]; + + let methods = [ + InterfaceMethod< + [{Returns whether same operands and results scales are required.}], + "bool", "RequiredSameOperandsAndResultsScale", + (ins "bool":$sign, "int":$bit_width), [{}], [{return true;}] + >, + InterfaceMethod< + [{Returns whether operands and results must have the same quantized axis.}], + "bool", "RequiredSameQuantizedAxes", + (ins), [{}], [{return true;}] + >, + ]; + + let verify = [{ + return tf_quant::VerifySameScales($_op); + }]; +} + +def DynamicRangeQuantizedOpInterface : OpInterface< + "DynamicRangeQuantizedOpInterface"> { + let cppNamespace = "tf_quant"; + let description = [{ + Interface for ops dynamic range quantization is supported. + + If the op has the kernel support for dynamic range quantization, Q/DQ op + pairs connected to the op are rewritten by its quantized alternatives where + a new op uses Q ops for its operands instead of DQ op. Otherwise, it is + left as is for weight-only which means the weight is dequantized at runtime. + + For example, if the kernel does not support dynamic range quantization the + graph will be converted into the following IR: + + %q_w = "tfl.pseudo_qconst"() { + qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> + %w = "tfl.dequantize"(%q_w) : + (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>) -> + tensor<64x3x3x3xf32> + %conv = "tfl.conv_2d"(%input_act, %w, %bias) + + but if it is supported, it will be rewritten as: + + %q_w = "tfl.pseudo_qconst"() { + qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> + %conv = "tfl.conv_2d"(%input_act, %q_w, %bias) + + Note that this is part of reaching feature parity with the old quantizer for + dynamic range quantization except: + - Only use_updated_hybrid_scheme=True is supported which means the ops with + the asymmetrically quantizing input support is enabled to use this feature + during MLIR graph rewriting passes while it is configurable in the old + quantizer. So when those ops are matched during graph rewriting passes, + MLIR quantizer will always ignore the pre-set value of the attribute, if + there's any, and set it to True. The reason behind this decision is that + generally activations of these ops show better accuracy with asymmetric + input quantization so we want to deprecate symmetric activation quantization + for those ops eventually. 
+ - Unlike to the old quantizer, per-channel quantization is supported for + weight-only TransposeConvOp. + }]; + + let methods = [ + InterfaceMethod< + [{Returns the quantizable operand indices of the op.}], + "std::vector", "GetQuantizableOperandIndices", + (ins), [{}], [{return {};}]>, + InterfaceMethod< + [{Returns whether the op has the kernel support for dynamic range + quantization.}], + "bool", "GetDynamicRangeQuantKernelSupport", + (ins), [{}], [{return false;}]>, + InterfaceMethod< + [{Returns whether the op requires asymmetric quantize input attribute + setting.}], + "bool", "RequireAsymmetricQuantizeInputsAttr", + (ins), [{}], [{return false;}]>, + ]; +} + +// Specify this trait if the op has a fixed output value range. +class FixedResultScale : NativeOpTrait::Impl")>; + +// Specify this trait if the bias-th input of the op is a bias input, which +// needs a scale based on the scales of op1 and op2. +class AccumulatorUniformScale : NativeOpTrait< + !strconcat("tf_quant::AccumulatorUniformScale<", + !interleave([bias, op1, op2], ", "), + ">::Impl")>; + +// Specify the operand index of the coefficient operand for an affine op +// and also the quantization dimension if per-axis quantization is support. +// If the quantization dimension is -1, per-axis quantization isn't supported. +class AffineOpCoefficient : NativeOpTrait< + !strconcat("tf_quant::AffineOpCoefficient<", + !interleave([dim, index], ", "), + ">::Impl")>; + +// Specify this trait if the op does have quantizable output. Quantizers will +// apply quantization on this op. +def QuantizableResult : NativeOpTrait<"tf_quant::QuantizableResult">; +#endif // TF_Quantization diff --git a/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.cc b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.cc new file mode 100644 index 000000000000..80abc8815d8c --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.cc @@ -0,0 +1,184 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/numbers.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "tensorflow/core/framework/types.pb.h" + +// Returns whether the given dtype is a quantization type in TensorFlow. 
+static bool IsQuantizationType(tensorflow::DataType dtype) { + switch (dtype) { + case tensorflow::DT_QINT8: + case tensorflow::DT_QUINT8: + case tensorflow::DT_QINT16: + case tensorflow::DT_QUINT16: + case tensorflow::DT_QINT32: + return true; + default: + return false; + } +} + +namespace mlir { +namespace tf_quant { +namespace { +bool GetBooleanSpecs(const std::string& bool_val) { + bool result; + std::stringstream iss(bool_val); + iss >> std::boolalpha >> result; + return result; +} +} // namespace + +void ParseCustomOpSpecs(const absl::string_view node_names, + const CustomOpUpdateOptions& update_option, + CustomOpMap& custom_op_map) { + if (node_names.empty()) return; + + const std::vector custom_nodes = absl::StrSplit(node_names, ','); + + for (const std::string& cur_node : custom_nodes) { + const std::vector node_infos = absl::StrSplit(cur_node, '='); + const std::string& node_name = node_infos[0]; + const std::string& node_specification = node_infos[1]; + CustomOpInfo new_node_info; + switch (update_option) { + case CustomOpUpdateOptions::kInputIndices: { + const std::vector indices = + absl::StrSplit(node_specification, '-'); + for (const std::string& cur_index : indices) { + custom_op_map[node_name].quantizable_input_indices.push_back( + std::stoi(cur_index)); + } + break; + } + case CustomOpUpdateOptions::kWeightOnly: + custom_op_map[node_name].is_weight_only = + GetBooleanSpecs(node_specification); + break; + case CustomOpUpdateOptions::kNoSideEffect: + custom_op_map[node_name].no_side_effect = + GetBooleanSpecs(node_specification); + break; + } + } +} + +bool ParseInputNodeQuantSpecs(const absl::string_view node_names, + const absl::string_view min_values, + const absl::string_view max_values, + const absl::string_view inference_type, + QuantizationSpecs* quant_specs) { + const std::vector input_nodes = absl::StrSplit(node_names, ','); + std::vector> node_mins; + if (!min_values.empty()) { + std::vector node_mins_str = absl::StrSplit(min_values, ','); + for (const std::string& node_mins_str : node_mins_str) { + double value; + if (!absl::SimpleAtod(node_mins_str, &value)) { + llvm::errs() << "Unexpected mins: " << node_mins_str << "\n"; + return true; + } + node_mins.push_back(value); + } + } + + std::vector> node_maxs; + if (!max_values.empty()) { + const std::vector node_maxs_str = + absl::StrSplit(max_values, ','); + for (const std::string& node_maxs_str : node_maxs_str) { + double value; + if (!absl::SimpleAtod(node_maxs_str, &value)) { + llvm::errs() << "Unexpected mins: " << node_maxs_str << "\n"; + return true; + } + node_maxs.push_back(value); + } + } + + tensorflow::DataType final_type = tensorflow::DT_FLOAT; + if (!inference_type.empty() && + !DataType_Parse(std::string(inference_type), &final_type)) { + return true; + } + return GetInputNodeQuantSpecs(input_nodes, node_mins, node_maxs, final_type, + quant_specs); +} + +bool GetInputNodeQuantSpecs(const std::vector& node_names, + const std::vector>& node_mins, + const std::vector>& node_maxs, + const tensorflow::DataType inference_type, + QuantizationSpecs* quant_specs) { + quant_specs->inference_type = inference_type; + + // If min/max are not specified, just return; + if (node_mins.empty() || node_maxs.empty()) return false; + + // Otherwise make sure min/max has the same size as inputs. + if (IsQuantizationType(inference_type)) { + // min/max should have same size as inputs, or shouldn't be specified. 
+ if (node_names.size() != node_mins.size() || + node_names.size() != node_maxs.size()) { + return true; + } + for (int i = 0; i < node_names.size(); ++i) { + quant_specs->input_ranges.push_back({node_mins[i], node_maxs[i]}); + } + return false; + } + if (!node_mins.empty()) { + llvm::dbgs() << "Ignored input_min_values."; + } + if (!node_maxs.empty()) { + llvm::dbgs() << "Ignored input_max_values."; + } + return false; +} + +std::string GetQDQQuantModeString(const QDQConversionMode mode) { + switch (mode) { + case QDQConversionMode::kQDQStatic: + return "Static"; + case QDQConversionMode::kQDQDynamic: + return "Dynamic"; + case QDQConversionMode::kQDQStrict: + return "Strict"; + default: + return "NoQDQ"; + } +} + +QDQConversionMode GetQDQQuantModeFromString(const std::string& mode_str) { + if (mode_str == "Static") return QDQConversionMode::kQDQStatic; + if (mode_str == "Dynamic") return QDQConversionMode::kQDQDynamic; + if (mode_str == "Strict") return QDQConversionMode::kQDQStrict; + return QDQConversionMode::kQDQNone; +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.h b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.h new file mode 100644 index 000000000000..d65496bc402e --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.h @@ -0,0 +1,255 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines node specs for quantization and the methods to parse +// command line flags to these specs. + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_QUANTIZATION_LIB_TF_QUANTIZATION_CONFIG_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_QUANTIZATION_LIB_TF_QUANTIZATION_CONFIG_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/lite/tools/optimize/reduced_precision_metadata.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace mlir { +namespace tf_quant { + +// Stores information about how to quantize a user-specified custom operation. +struct CustomOpInfo { + std::vector quantizable_input_indices; + bool is_weight_only = false; + bool no_side_effect = true; +}; + +using CustomOpMap = std::unordered_map; +enum CustomOpUpdateOptions { kInputIndices, kWeightOnly, kNoSideEffect }; +enum class QDQConversionMode { kQDQNone, kQDQStatic, kQDQDynamic, kQDQStrict }; + +struct QuantizationSpecs { + // Which function this node quant specifications belong to. + std::string target_func = "main"; + + // Whether to trigger quantization passses for post-training quantization. + // If true, the model input doesn't require user specified input ranges. 
+ bool post_training_quantization = false; + + // Whether to allow dynamic range quantization. This is the easiest + // quantization mode which doesn't require QAT or sample inputs. + // This option only targets `DT_HALF` and `DT_QINT8` inference type. + bool weight_quantization = false; + + // Whether to use the MLIR dynamic range quantizer instead of TOCO. + bool enable_mlir_dynamic_range_quantizer = false; + + // Whether to allow weight-only quantization. This scheme quantizes + // weights but will dequantize them back at runtime which is useful for + // memory bound case without kernel support available in lower precisions. + // Used in MLIR dynamic range quantizer. + bool weight_only_quantization = false; + + // The minimum number of elements in a weights array required to apply + // quantization. This is especially useful not to quantize small tensors as + // it is hard to get performance benefits from them with quantization. Used + // in MLIR dynamic range quantizer with int8 weight data type. + int64_t minimum_elements_for_weights = 1024; + + // Whether to calculate scales in float to keep quantized values the same with + // old TOCO quantizer. + bool legacy_float_scale = false; + + // Whether to perform per-tensor quantization. Currently, this option is only + // valid when the quantization parameters need to be created by scanning the + // constant content (post-training quantization or QAT without weight + // FakeQuant). + bool disable_per_channel = false; + + // Whether to disable per-channel weight quantization and enable legacy per + // tensor quantization. The legacy quantization for Dense layers is + // inconsistent with Conv 1x1 which always performs per channel quantization. + bool disable_per_channel_for_dense_layers = false; + + // Whether to use fixed output ranges of the activation ops (tanh, sigmoid, + // etc.) and not infer weight constants. + // If this option is set, quantization emulation ops should be placed after + // the ops in the input graph. This flag should be set to false for + // post-training quantization. + bool disable_infer_tensor_range = false; + + // Whether to use the unfrozen variable quantization in MLIR. Typically, + // variables are frozen for passing passes, but some variables aren't frozen. + // If it is true, QuantizeVariables pass will be added after the + // PrepareQuantizePass. + bool enable_mlir_variable_quantization = false; + + // The node type when the model is exported. Currently this is limited to + // DT_FLOAT, DT_HALF, DT_QINT8, and DT_QUINT8. When DT_HALF is used, the + // `weight_quantization` flag needs to set to true. When DT_QUINT8 is used, + // the `weight_quantization` flag needs to set to false. + tensorflow::DataType inference_type = tensorflow::DT_FLOAT; + + // The input and output data type during inference. This flag is only used + // when `inference_type` is different from DT_FLOAT. This flag can only be set + // to DT_FLOAT or as same as `inference_type`. If this flag is different + // from `inference_type`, adaptor ops are inserted as heading and tailing ops + // in the result model. + tensorflow::DataType inference_input_type = tensorflow::DT_FLOAT; + + // Input node ranges. These ranges are stored as the same order of function + // arguments. They are only used when `weight_quantization` is set to false, + // and the model is required to have quantization parameters, either from + // quantization aware training or calibration, for the remaining tensors. 
+ std::vector, std::optional>> + input_ranges; + + // Whether to disable setting the quantization parameters of the input nodes + // using input ranges. + bool disable_set_input_nodes_quantization_params = false; + + // The default ranges can be used when a tensor doesn't have quantization + // parameters and couldn't be quantized. Used only for latency tests. + std::pair, std::optional> default_ranges; + + // A serialized "QuantizationInfo" object to specify value ranges for some of + // the tensors with known names. + std::string serialized_quant_stats = ""; + + // A bitmask to encode support for reduced precision inference in the model. + tflite::optimize::ReducedPrecisionSupport support_mask = + tflite::optimize::ReducedPrecisionSupport::None; + + // Whether to run the passes to propagate the quantization parameters and + // graph rewrites. Returns false if the inference_type is DT_FLOAT or + // `weight_quantization` flag is set. + bool RunPropagationAndRewriteQuantizationPasses() const { + return inference_type != tensorflow::DT_FLOAT && !weight_quantization; + } + + // TODO: b/202075505 - make implicit weight type clearer + // Whether run the passes and graph rewrites for dynamic range quantization. + bool RunAndRewriteDynamicRangeQuantizationPasses() const { + bool dynamic_range_quantize = + (inference_type != tensorflow::DT_FLOAT) && weight_quantization && + !post_training_quantization && !disable_infer_tensor_range && + enable_mlir_dynamic_range_quantizer; + return dynamic_range_quantize; + } + + // Returns whether this inference type represents a signed storage type. + bool IsSignedInferenceType() const { + switch (inference_type) { + case tensorflow::DT_QUINT8: + case tensorflow::DT_QUINT16: + return false; + default: + return true; + } + } + + // Gets the width of this quantization type. Returns 0 if it isn't a + // quantization type. + int64_t GetQuantizationTypeWidth() const { + switch (inference_type) { + case tensorflow::DT_INT8: + case tensorflow::DT_UINT8: + case tensorflow::DT_QINT8: + case tensorflow::DT_QUINT8: + return 8; + case tensorflow::DT_INT16: + case tensorflow::DT_UINT16: + case tensorflow::DT_QINT16: + case tensorflow::DT_QUINT16: + return 16; + case tensorflow::DT_INT32: + case tensorflow::DT_QINT32: + return 32; + default: + return 0; + } + } + + // Whether to add the NumericVerify ops to verify numbers before and after + // quantization. + bool verify_numeric = false; + // Whether to add verification for layer by layer, or on whole model. When + // disabled (per-layer) float and quantized ops will be run from same input + // (output of previous quantized layer). When enabled, float and quantized ops + // will run with respective float and quantized output of previous ops. + bool whole_model_verify = false; + + // Whether to use fake quant attributes to calculate quantization parameters. + bool use_fake_quant_num_bits = false; + + // Names of ops to block from quantization. Used in QuantizePass. + // For dynamic range quantization, ops in blocklist are quantized in weight- + // only manner. + absl::flat_hash_set ops_blocklist; + + // Names of locations to block from quantization. Used in QuantizePass. + absl::flat_hash_set nodes_blocklist; + + // Map from custom op code to custom op quantization information. + // For dynamic range quantization, among the custom ops in the graph those + // specified in this map are subject to quantization. 
+ CustomOpMap custom_map; + + // If other than kQDQNone, the model is a floating point graph with QDQ ops + // to be eliminated and fused into quantized kernels. + QDQConversionMode qdq_conversion_mode = QDQConversionMode::kQDQNone; + + // When set, adheres to the QDQ annotations added by the framework when + // possible rather than quantizing any op that is possible to quantize. + bool strict_qdq_mode = false; +}; + +// Parses the command line flag strings to the CustomOpMap specification. +void ParseCustomOpSpecs(absl::string_view node_names, + const CustomOpUpdateOptions& update_option, + CustomOpMap& custom_op_map); + +// Parses the command line flag strings to the quantization specification for +// input arrays of a graph. The array names are not stored in the spec, and will +// be matched by position. Returns true if failed. +bool ParseInputNodeQuantSpecs(absl::string_view node_names, + absl::string_view min_values, + absl::string_view max_values, + absl::string_view inference_type, + QuantizationSpecs* quant_specs); + +// Gets the quantization specification for input arrays. The array names are not +// stored in the spec, and will be matched by position. The min/max will be +// ignored if the inference_type isn't a quantized type. Returns true if failed. +bool GetInputNodeQuantSpecs(const std::vector& node_names, + const std::vector>& node_mins, + const std::vector>& node_maxs, + tensorflow::DataType inference_type, + QuantizationSpecs* quant_specs); + +// Returns a human-readable string of the QDQQuantMode enum class +std::string GetQDQQuantModeString(QDQConversionMode mode); + +// Returns the QDQQuantMode enum class from a human-readable string +QDQConversionMode GetQDQQuantModeFromString(const std::string& mode_str); +} // namespace tf_quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_QUANTIZATION_LIB_TF_QUANTIZATION_CONFIG_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_driver.cc b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_driver.cc new file mode 100644 index 000000000000..a3b6f7aeed9e --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_driver.cc @@ -0,0 +1,958 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
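A small usage sketch for the flag parser declared in tf_quantization_config.h above; the custom op names below ("CustomLayerNorm", "CustomGelu") are hypothetical, and only the parsing behavior itself comes from tf_quantization_config.cc.

#include <cassert>

#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.h"

// Illustrative only: demonstrates the flag string format the parser expects.
void CustomOpSpecExamples() {
  using mlir::tf_quant::CustomOpMap;
  using mlir::tf_quant::CustomOpUpdateOptions;

  CustomOpMap custom_op_map;

  // Entries are comma-separated "name=spec" pairs; for kInputIndices the spec
  // is a '-'-separated list of operand indices. After this call,
  // custom_op_map["CustomLayerNorm"].quantizable_input_indices holds {0, 2}
  // and custom_op_map["CustomGelu"].quantizable_input_indices holds {1}.
  mlir::tf_quant::ParseCustomOpSpecs("CustomLayerNorm=0-2,CustomGelu=1",
                                     CustomOpUpdateOptions::kInputIndices,
                                     custom_op_map);

  // Boolean specs are parsed with std::boolalpha, so "true"/"false" are used.
  mlir::tf_quant::ParseCustomOpSpecs(
      "CustomGelu=true", CustomOpUpdateOptions::kWeightOnly, custom_op_map);
  assert(custom_op_map["CustomGelu"].is_weight_only);
}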
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_driver.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_traits.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" + +namespace mlir { +namespace tf_quant { +namespace { + +constexpr int32_t kBiasMax = std::numeric_limits::max() / 2; + +// Uses the type of `value` to set the initial state of the index-th result if +// `as_result` is true or index-th operand if `as_result` is false. The state +// is immutable if the type is a quantized type. Returns the index of this +// new state in the state vector. 
+void InitializeStateForValue( + Operation* op, const int index, const Value value, const bool as_result, + std::vector& states, + DenseMap& value_to_state, + DenseMap& operand_states, + DenseMap& result_states) { + const auto [cached, inserted] = value_to_state.try_emplace(value, 0); + if (!inserted) { + if (as_result) { + result_states[{op, index}] = cached->second; + } else { + operand_states[{op, index}] = cached->second; + } + return; + } + + const QuantizedType quantized_type = + QuantizedType::getQuantizedElementType(value.getType()); + + const bool immutable = quantized_type != nullptr; + const QuantizationDriver::QuantStateIndex next_state_index = states.size(); + states.push_back({quantized_type, immutable}); + if (as_result) { + result_states[{op, index}] = next_state_index; + } else { + operand_states[{op, index}] = next_state_index; + } + + cached->second = next_state_index; +} + +bool HasPerAxisQuantizedOperand(Operation* op) { + for (int i = 0; i < op->getNumOperands(); ++i) { + if (auto dq_op = dyn_cast_or_null( + op->getOperand(i).getDefiningOp())) { + auto type = + mlir::cast(dq_op.getArg().getType()).getElementType(); + if (auto per_axis_qtype = + mlir::dyn_cast_or_null( + QuantizedType::getQuantizedElementType(type))) { + return true; + } + } + } + return false; +} + +} // namespace + +void QuantizationDriver::InitializeArgState(const BlockArgument arg, + const Value arg_value) { + const auto [cached, inserted] = value_to_state_.try_emplace(arg_value, 0); + if (!inserted) { + arg_states_[arg] = cached->second; + return; + } + + const QuantizedType quantized_type = + QuantizedType::getQuantizedElementType(arg_value.getType()); + const bool immutable = quantized_type != nullptr; + const QuantizationDriver::QuantStateIndex next_state_index = states_.size(); + states_.push_back({quantized_type, immutable}); + arg_states_[arg] = next_state_index; + cached->second = next_state_index; +} + +void QuantizationDriver::InitializeOperandState(Operation* op, const int index, + const Value value) { + InitializeStateForValue(op, index, value, /*as_result=*/false, states_, + value_to_state_, operand_states_, result_states_); +} + +void QuantizationDriver::InitializeResultState(Operation* op, const int index, + const Value value) { + InitializeStateForValue(op, index, value, /*as_result=*/true, states_, + value_to_state_, operand_states_, result_states_); +} + +std::unique_ptr QuantizationDriver::GetQuantSpec(Operation* op) { + return op_quant_spec_getter_(op); +} + +std::unique_ptr QuantizationDriver::GetQuantScaleSpec( + Operation* op) { + return op_quant_scale_spec_getter_(op); +} + +bool QuantizationDriver::IsQuantized(Operation* op) { + for (int i = 0; i < op->getNumResults(); ++i) { + if (GetResultQuantState(op, i).IsEmpty()) return false; + } + return true; +} + +bool QuantizationDriver::SetConstantResultParams(Operation* op) { + DenseFPElementsAttr attr; + const Value result = op->getResult(0); + if (!matchPattern(result, m_Constant(&attr))) { + return false; + } + // TODO: b/323478683 - Make storage_type_width and narrow_range configurable. 
+ Type final_type; + const auto it = optimized_weights_.find(op); + const bool is_weight = it != optimized_weights_.end(); + const bool is_weight_with_per_channel_support = + is_weight && it->second != -1 && is_signed_; + + if (is_weight_with_per_channel_support && !disable_per_channel_) { + // When `disable_per_channel_` is false, per-channel symmetric quantization + // parameters are created from the weights when the ops support per-channel + // quantization. Otherwise, uses per-tensor asymmetric quantization with + // narrow range. + + // per-axis quantization weight, with symmetric min/max enforced. + final_type = GetUniformQuantizedPerAxisTypeForWeight( + attr, it->second, /*symmetric=*/true, /*num_bits=*/8, is_signed_, + /*narrow_range=*/true, legacy_float_scale_); + } else { + // per-tensor quantization weight + final_type = GetUniformQuantizedTypeForWeight( + attr, /*symmetric=*/is_weight && is_signed_, + /*num_bits=*/8, is_signed_, + /*narrow_range=*/is_weight, legacy_float_scale_); + } + if (const auto quant_type = mlir::dyn_cast_or_null(final_type); + quant_type != nullptr) { + return SetResultParams(op, /*result_index=*/0, quant_type); + } + return false; +} + +bool QuantizationDriver::SetResultParams(Operation* op, const int result_index, + const QuantizedType quantized_type) { + QuantState& state = GetResultQuantState(op, result_index); + if (state.params == quantized_type) { + return false; + } + if (!state.IsEmpty()) { + RequantizeStates& rescales = GetResultRequantizeStates(op, result_index); + RequantizeState& rescale = rescales.emplace_back(); + rescale.pos = RequantizeState::ON_INPUT; + rescale.params = quantized_type; + return true; + } + state.params = quantized_type; + AddUserToList(op, result_index); + return true; +} + +QuantizedType QuantizationDriver::GetBiasParams( + Operation* op, const int bias_index, + const ArrayRef non_bias_operand_indices, + const AccumulatorScaleFunc func) { + QuantState& bias_state = GetOperandQuantState(op, bias_index); + if (!bias_state.IsEmpty()) { + return bias_state.params; + } + std::vector op_types{}; + op_types.reserve(non_bias_operand_indices.size()); + + int adjusted_quant_dim = -1; + if (op->getNumOperands() > bias_index) { + // Some kernels allow 1D bias, broadcasting it inside the kernel. In this + // case, the `quantizedDimension=0` when quantizing per-channel. + // However, for some kernels which require bias to be already broadcasted + // to match the accumulation shape, the very last index should be used. + Operation* bias_op = op->getOperand(bias_index).getDefiningOp(); + if (bias_op != nullptr) { + Type bias_type = bias_op->getResult(0).getType(); + if (bias_type != builder_.getNoneType()) { + const int bias_rank = mlir::dyn_cast(bias_type).getRank(); + adjusted_quant_dim = bias_rank > 1 ? 
bias_rank - 1 : 0; + } + } + } + + for (const int non_bias_operand_index : non_bias_operand_indices) { + const QuantState& non_bias_state = + GetOperandQuantState(op, non_bias_operand_index); + op_types.push_back(non_bias_state.params); + } + return func(op_types, adjusted_quant_dim, legacy_float_scale_); +} + +bool QuantizationDriver::SetOperandParams(Operation* op, + const int operand_index, + const QuantizedType quantized_type, + const bool override) { + QuantState& state = GetOperandQuantState(op, operand_index); + if (state.params == quantized_type) { + return false; + } + + if (!state.IsEmpty() && !override) { + RequantizeStates& rescales = GetOperandRequantizeStates(op, operand_index); + for (RequantizeState& rescale : rescales) { + if (rescale.params == quantized_type) { + rescale.users.emplace_back(op, operand_index); + return true; + } + } + RequantizeState& rescale = rescales.emplace_back(); + rescale.pos = RequantizeState::ON_OUTPUT; + rescale.params = quantized_type; + rescale.users.emplace_back(op, operand_index); + return true; + } + + state.params = quantized_type; + AddOperandToList(op, operand_index); + return true; +} + +void QuantizationDriver::QuantizeOpResult(Operation* op, const int result_index, + const QuantizedType quantized_type) { + builder_.setInsertionPointAfter(op); + const Value original_result = op->getResult(result_index); + QuantizeValue(original_result, quantized_type, op->getLoc()); +} + +void QuantizationDriver::QuantizeArg(BlockArgument arg, + const QuantizedType quantized_type) { + builder_.setInsertionPointToStart(arg.getOwner()); + QuantizeValue(arg, quantized_type, builder_.getUnknownLoc()); +} + +void QuantizationDriver::QuantizeValue(Value value, + QuantizedType quantized_type, + const Location loc) { + const Type expressed_type = value.getType(); + const Type new_value_type = + quantized_type.castFromExpressedType(expressed_type); + // Skip if `value` or `value`'s element type doesn't match the expressed type + // of `quantized_type`. + if (new_value_type == nullptr) return; + + auto quantize = builder_.create( + loc, new_value_type, value); + auto dequantize = builder_.create( + loc, expressed_type, quantize.getResult()); + + // This attribute is set to distinguish the quantize ops being added by the + // quantization pass. These ops can be removed without losing original + // program accuracy. + // TODO: b/323478683 - Make the attribute being part of op definition. + quantize->setAttr(kVolatileOpAttrName, builder_.getUnitAttr()); + + // `original_result` has a use to `quantize`, so this will replace that use + // by the result of `dequantize`. Remember to reset that use afterwards + value.replaceAllUsesWith(dequantize); + quantize.getOperation()->replaceUsesOfWith(dequantize, value); +} + +void QuantizationDriver::RequantizeOpResult(Operation* op, + const int result_index, + RequantizeStates& states) { + if (states.empty()) return; + + builder_.setInsertionPointAfter(op); + Value value = op->getResult(result_index); + RequantizeState::RequantizePosition pos = states.front().pos; + if (pos == RequantizeState::NO_REQUANTIZE) { + return; + } + for (const RequantizeState& state : states) { + // Check that all requantization positions are the same for each state. + // Unsure if this check is required. + if (state.pos != pos) { + return; + } + } + if (pos == RequantizeState::ON_OUTPUT) { + Operation* user = value.getUses().begin().getUser(); + if (isa(user)) { + // The requantize op is inserted between `quantize` and `dequantize` ops. 
+ value = user->getResult(0); + builder_.setInsertionPointAfter(user); + } + } + RequantizeValue(value, states, op->getLoc()); +} + +void QuantizationDriver::RequantizeArg(const BlockArgument arg, + RequantizeStates& states) { + Value value = arg; + builder_.setInsertionPointToStart(arg.getOwner()); + if (value.hasOneUse()) { + Operation* user = value.use_begin().getUser(); + if (auto q = dyn_cast(user)) { + value = q.getResult(); + builder_.setInsertionPoint(arg.getOwner(), ++Block::iterator(user)); + } + } + RequantizeValue(value, states, builder_.getUnknownLoc()); +} + +void QuantizationDriver::RequantizeValue(Value value, RequantizeStates& states, + const Location loc) { + if (states.empty() || states.front().pos == RequantizeState::NO_REQUANTIZE) { + return; + } + if (states.front().pos == RequantizeState::ON_INPUT) { + RequantizeState& state = states.front(); + const Type expressed_type = value.getType(); + // The value needs to be requantized. A Quantize op will be created to use + // it as the operand and replace its uses. + const Type new_type = state.params.castFromExpressedType(expressed_type); + if (!new_type) return; + auto requantize_op = + builder_.create(loc, new_type, value); + value.replaceAllUsesWith(requantize_op); + requantize_op.getOperation()->replaceUsesOfWith(requantize_op, value); + // This requantization was defined as required for the result value, so + // there should be only one requant state. + return; + } + + // If this is an operand that requires requantization, then the value should + // only have one `DequantizeCastOp` user which produces the operand value. + if (!value.hasOneUse()) { + return; + } + auto dequant_op = dyn_cast_or_null( + value.use_begin().getUser()); + if (!dequant_op) { + return; + } + // It is possible that the dequant value is used by a op that doesn't require + // requant, so only overwrite the first if that is not the case. + const int num_uses = std::distance(dequant_op.getResult().use_begin(), + dequant_op.getResult().use_end()); + + // Whether to replace quantization params of the first dequantize op + // after the quantized value is produced. + // If there is a use other than the requantize states, then we can't clobber. + bool clobber_first = num_uses <= states.size(); + for (RequantizeState& state : states) { + Type expressed_type = QuantizedType::castToExpressedType(value.getType()); + if (!expressed_type) continue; + // The value needs to be requantized. A Quantize op will be created to use + // it as the operand and replace its uses. + const Type new_type = state.params.castFromExpressedType(expressed_type); + // This value isn't an expressed type (float), skip. + if (!new_type) continue; + + auto requantize_op = + builder_.create(loc, new_type, value); + + if (clobber_first) { + dequant_op.setOperand(requantize_op.getResult()); + // All ops requiring this value already use the result of dequant. 
+ clobber_first = false; + } else { + auto new_dequant_op = builder_.create( + loc, dequant_op.getResult().getType(), requantize_op.getResult()); + for (auto [op, operand_idx] : state.users) { + op->setOperand(operand_idx, new_dequant_op.getResult()); + } + } + } +} + +// A heuristic to get quantization parameters satisfies the same scale +// constraints: +// - If there are immutable states, +// - use the single input, or, +// - use the single output, or, +// - use the first one in the collection, +// - use the single input if it is ready, or, +// - use the single output if it is ready, or, +// - use the first ready one in the collection. +QuantizedType QuantizationDriver::GetQuantParamsForSameScaleConstraint( + Operation* op) { + // Two vector to collect Non-empty operands and results states. + std::vector mutable_states, immutable_states; + for (int i = 0; i < op->getNumOperands(); ++i) { + QuantState& state = GetOperandQuantState(op, i); + if (state.immutable) { + immutable_states.push_back(&state); + } else if (!state.IsEmpty()) { + mutable_states.push_back(&state); + } + } + + const int immutable_operands_num = immutable_states.size(); + const int mutable_operands_num = mutable_states.size(); + // Use the operand's state if it is immutable and it is the only one + // operand. + if (op->getNumOperands() == 1 && immutable_operands_num == 1) { + return immutable_states.front()->params; + } + + for (int i = 0; i < op->getNumResults(); ++i) { + QuantState& state = GetResultQuantState(op, i); + if (state.immutable) { + immutable_states.push_back(&state); + } else if (!state.IsEmpty()) { + mutable_states.push_back(&state); + } + } + + const int immutable_results_num = + immutable_states.size() - immutable_operands_num; + const int mutable_results_num = mutable_states.size() - mutable_operands_num; + // Use the result's state if it is immutable and it is the only one result. + if (op->getNumResults() == 1 && immutable_results_num == 1) { + return immutable_states.back()->params; + } + + // Use the first immutable state to quantize the rest operands and results. + if (!immutable_states.empty()) return immutable_states.front()->params; + + // If there are no immutable states, use the operand's state if it is the + // only one operand and has parameters propagated. + if (op->getNumOperands() == 1 && mutable_operands_num == 1) { + return mutable_states.front()->params; + } + + // If there are no immutable states, use the result's state if it is the + // only one result and has parameters propagated. + if (op->getNumResults() == 1 && mutable_results_num == 1) { + return mutable_states.back()->params; + } + + // Use the first propagated state to quantize the rest operands and results. + if (!mutable_states.empty()) return mutable_states.front()->params; + + // None operands/results have parameters propagated, skip this node for now. + return {}; +} + +void QuantizationDriver::PreprocessConstantOps() { + fn_.walk([&](arith::ConstantOp cst) { + // Non-float tensors are neither weights nor require quantization. + const auto type = mlir::dyn_cast(cst.getType()); + if (!type || !mlir::isa(type.getElementType())) return; + + // Skip if the value is NaN or INF. + // Otherwise the illegal scale/zp will be calculated. 
+ auto float_attr = mlir::dyn_cast(cst.getValueAttr()); + if (float_attr && (float_attr.getValues().empty() || + !float_attr.getValues()[0].isFinite())) { + return; + } + + const Value value = cst.getResult(); + builder_.setInsertionPoint(cst); + + // The following loop will change the value uses, thus we cache all the uses + // needs to be changed. + SmallVector> uses; + for (OpOperand& use : value.getUses()) { + uses.push_back({use.getOwner(), use.getOperandNumber()}); + } + for (const auto [user, operand_num] : uses) { + const std::unique_ptr spec = GetQuantSpec(user); + const std::unique_ptr scale_spec = + GetQuantScaleSpec(user); + const BiasParamsMap biases = spec->biases_params; + + // The quantization parameters of a `weight` shouldn't be determined by + // other values. So any constants which are not bias, an operand of an + // op with same scale requirements, and haven't been quantized are + // weights. + if (!biases.contains(operand_num) && + !scale_spec->has_same_scale_requirement && + !dyn_cast(user)) { + // Needs to scan the content of weights to get the quantization + // parameters if there are no quantization parameters (FakeQuant ops). + // For this case, the weight will not be duplicated. + weights_.insert(cst); + if (spec->coeff_op_quant_dim.find(operand_num) != + spec->coeff_op_quant_dim.end()) { + optimized_weights_.insert( + {cst, spec->coeff_op_quant_dim[operand_num]}); + } + } else { + // This is a bias or an operand of an op with same scale requirements, + // so the quantization parameter are propagated from or determined by + // other values. Duplicate this constant in case it is shared by + // different users. + if (uses.size() > 1) { + auto new_constant_op = + builder_.create(cst.getLoc(), cst.getValue()); + user->setOperand(operand_num, new_constant_op); + } + } + } + }); +} + +void QuantizationDriver::SetupAllStates() { + for (BlockArgument arg : fn_.getArguments()) { + args_.push_back(arg); + Value value = arg; + // If the argument is quantized, it should only has one user. + if (arg.hasOneUse()) { + Operation* user = value.use_begin().getUser(); + if (auto q = dyn_cast(user)) { + value = q.getResult(); + } + } + InitializeArgState(arg, value); + } + + fn_.walk([&](Operation* op) { + std::unique_ptr scale_spec = GetQuantScaleSpec(op); + if (!IsOpQuantizable(op) && !scale_spec->has_same_scale_requirement) { + return; + } + work_list_.push_back(op); + + for (int i = 0; i < op->getNumOperands(); ++i) { + Value operand = op->getOperand(i); + if (Operation* inst = operand.getDefiningOp()) { + // If the operand comes from a `mlir::quant::ir::DequantizeCastOp`, we + // use the quantized input of this `mlir::quant::ir::DequantizeCastOp` + // to set the state. + if (auto dq = dyn_cast(inst)) { + operand = dq.getArg(); + } + } + InitializeOperandState(op, i, operand); + } + + for (int i = 0; i < op->getNumResults(); ++i) { + Value result = op->getResult(i); + // If the result has been quantized, it should only be used by a + // `mlir::quant::ir::QuantizeCastOp`. For this case, we uses the quantized + // result to create the state and mark it immutable. 
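+      // Schematic example (simplified spelling) of the QAT-produced pattern
+      // this branch looks for:
+      //   %res = "some_op"(...)
+      //   %q   = QuantizeCastOp(%res)   // carries the quantized element type
+      //   %dq  = DequantizeCastOp(%q)
+      // Passing %q instead of %res to `InitializeResultState` lets the state
+      // be created from the quantized type and marked immutable.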
+ if (result.hasOneUse()) { + Operation* user = result.use_begin().getUser(); + if (auto q = dyn_cast(user)) { + result = q.getResult(); + } + } + InitializeResultState(op, i, result); + } + }); +} + +arith::ConstantOp QuantizationDriver::DuplicateConstantOpIfNeeded( + arith::ConstantOp op, Operation* target_op, const int operand_index) { + if (op.getResult().hasOneUse()) { + return op; + } + OpBuilder builder(op->getContext()); + builder.setInsertionPointAfter(op); + arith::ConstantOp new_op = cast(builder.clone(*op)); + target_op->getOpOperand(operand_index).set(new_op.getResult()); + InitializeOperandState(target_op, operand_index, new_op.getResult()); + InitializeResultState(new_op, 0, new_op.getResult()); + return new_op; +} + +bool QuantizationDriver::ShouldCheckBiasScale( + Operation* op, const int bias_index, ArrayRef input_indices, + const QuantizedType quantized_type, int& input_index, int& filter_index) { + // For now, restrict scale adjustment to ops with affine quantized weights, + // and having weights and biases as constants. This currently only applies to + // FC and Conv* ops. Restriction for the weight can be relaxed if there are + // needs for adjusting scale of variable weights. + auto affine_op = dyn_cast(op); + auto bias_op = op->getOperand(bias_index).getDefiningOp(); + if (!affine_op || !bias_op || input_indices.size() != 2) return false; + if (!mlir::isa(bias_op.getValue())) return false; + filter_index = affine_op.GetAffineOperandIndex(); + if (!op->getOperand(filter_index).getDefiningOp()) { + return false; + } + if (filter_index == input_indices[0]) { + input_index = input_indices[1]; + } else if (filter_index == input_indices[1]) { + input_index = input_indices[0]; + } else { + return false; + } + + const QuantState& input_state = GetOperandQuantState(op, input_index); + const QuantState& filter_state = GetOperandQuantState(op, filter_index); + // If quantization parameter for the filter is fixed, should return it as-is. + // Only checks ops with 8-bit input and weights, and 32-bit biases. + return input_state.params.getStorageTypeIntegralWidth() == 8 && + filter_state.params.getStorageTypeIntegralWidth() == 8 && + quantized_type.getStorageTypeIntegralWidth() == 32; +} + +bool QuantizationDriver::SetBiasParamsWithAdjustments( + Operation* op, const int bias_index, ArrayRef input_indices, + const QuantizedType params) { + bool changed = false; + + int input_index; + int filter_index; + if (!ShouldCheckBiasScale(op, bias_index, input_indices, params, input_index, + filter_index)) { + return SetOperandParams(op, bias_index, params); + } + + QuantState input_state = GetOperandQuantState(op, input_index); + QuantState filter_state = GetOperandQuantState(op, filter_index); + auto bias_op = op->getOperand(bias_index).getDefiningOp(); + const double input_scale = + mlir::cast(input_state.params).getScale(); + + auto bias_values = mlir::cast(bias_op.getValue()); + // Restrict maximum absolute value of bias within INT_MAX / 2, to make some + // room for accumulator. 
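+  // Worked example with hypothetical numbers: if the largest |bias| is about
+  // 3.0e9 times the derived bias scale, the quantized bias would overflow
+  // int32. In that case the branch below picks
+  // new_bias_scale = max|bias| / kBiasMax and rescales the filter to
+  // new_bias_scale / input_scale, so the relation
+  // bias_scale == input_scale * filter_scale still holds after the adjustment.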
+ if (auto bias_quantized_type = mlir::dyn_cast(params); + bias_quantized_type != nullptr) { + double bias_half_range = 0.0f; + for (auto bias : bias_values.getValues()) { + if (bias_half_range < std::abs(bias.convertToFloat())) { + bias_half_range = std::abs(bias.convertToFloat()); + } + } + if (bias_half_range / bias_quantized_type.getScale() < kBiasMax) { + return SetOperandParams(op, bias_index, params); + } + const double new_bias_scale = + static_cast(bias_half_range) / kBiasMax; + + changed |= SetOperandParams( + op, bias_index, + UniformQuantizedType::getChecked( + bias_op->getLoc(), params.getFlags(), params.getStorageType(), + params.getExpressedType(), new_bias_scale, 0, + params.getStorageTypeMin(), params.getStorageTypeMax())); + arith::ConstantOp filter_op = DuplicateConstantOpIfNeeded( + op->getOperand(filter_index).getDefiningOp(), op, + filter_index); + if (!filter_op) { + return SetOperandParams(op, bias_index, params); + } + + const auto filter_quantized_type = + mlir::cast(filter_state.params); + changed |= SetOperandParams( + op, filter_index, + UniformQuantizedType::getChecked( + filter_op->getLoc(), filter_quantized_type.getFlags(), + filter_quantized_type.getStorageType(), + filter_quantized_type.getExpressedType(), + new_bias_scale / input_scale, 0, + filter_quantized_type.getStorageTypeMin(), + filter_quantized_type.getStorageTypeMax()), + /*override=*/true); + } else if (auto bias_quantized_type = + mlir::dyn_cast(params); + bias_quantized_type != nullptr) { + const auto filter_quantized_type = + mlir::cast(filter_state.params); + std::vector new_bias_scales = bias_quantized_type.getScales().vec(); + std::vector new_filter_scales = + filter_quantized_type.getScales().vec(); + + bool needs_adjustment = false; + for (int i = 0; i < bias_quantized_type.getScales().size(); ++i) { + const float abs_bias = std::abs(bias_values.getValues()[i]); + if (abs_bias / new_bias_scales[i] > kBiasMax) { + new_bias_scales[i] = static_cast(abs_bias) / kBiasMax; + new_filter_scales[i] = new_bias_scales[i] / input_scale; + needs_adjustment = true; + } + } + if (!needs_adjustment) { + return SetOperandParams(op, bias_index, params); + } + changed |= SetOperandParams( + op, bias_index, + quant::UniformQuantizedPerAxisType::getChecked( + bias_op->getLoc(), params.getFlags(), params.getStorageType(), + params.getExpressedType(), new_bias_scales, + bias_quantized_type.getZeroPoints(), + bias_quantized_type.getQuantizedDimension(), + params.getStorageTypeMin(), params.getStorageTypeMax())); + + arith::ConstantOp filter_op = DuplicateConstantOpIfNeeded( + op->getOperand(filter_index).getDefiningOp(), op, + filter_index); + changed |= SetOperandParams( + op, filter_index, + quant::UniformQuantizedPerAxisType::getChecked( + filter_op->getLoc(), filter_quantized_type.getFlags(), + filter_quantized_type.getStorageType(), + filter_quantized_type.getExpressedType(), new_filter_scales, + filter_quantized_type.getZeroPoints(), + filter_quantized_type.getQuantizedDimension(), + filter_quantized_type.getStorageTypeMin(), + filter_quantized_type.getStorageTypeMax()), + /*override=*/true); + } + return changed; +} + +// This method scans the operations in the function to setup the initial +// states for quantization parameter propagation. +// TODO: b/323478683 - This algorithm assumes there are only one pair of +// `mlir::quant::ir::QuantizeCastOp` and `mlir::quant::ir::DequantizeCastOp` ops +// between two quantizable ops. A sanity check should be applied. 
+void QuantizationDriver::Initialize() { + // Duplicate the bias constant, so the states can be setup correctly. + // TODO: b/323478683 - Function definition should also be duplicated if there + // are multiple call sites. + PreprocessConstantOps(); + + // Setup all the internal states. + SetupAllStates(); +} + +// Propagates the quantization parameters to the operands, results, and biases. +// TODO: b/323478683 - Do not use while loop to handle this logic. +bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { + // TODO: b/323478683 - Use a typed indicator instead of a bool value. + bool changed = false; + while (!work_list_.empty()) { + Operation* op = work_list_.back(); + work_list_.pop_back(); + + // This op has been quantized, so we should not consider it again. + if (quantized_.contains(op)) continue; + quantized_.insert(op); + + if (auto constant_op = dyn_cast(op); constant_op) { + // If the workflow requires inferring ranges from the content + // (post-training quantization) and it is weight (filter) and hasn't + // been quantized, we infer the quantization parameters from the content. + if (infer_tensor_range_ && IsWeight(constant_op) && !IsQuantized(op)) { + // The quantization parameters are determined by the content of the + // constant. + changed |= SetConstantResultParams(op); + } + continue; + } + + std::unique_ptr scale_spec = GetQuantScaleSpec(op); + + if (scale_spec->has_same_scale_requirement) { + const QuantizedType params = GetQuantParamsForSameScaleConstraint(op); + // The quantization parameters haven't been propagated to any operands + // or results. Skip this node for now. + if (!params) { + quantized_.erase(op); + continue; + } + + // If this is a QDQ conversion only, the op could have a same-scale + // requirement for the floating point kernel but allow per-axis + // quantization for the quantized kernel. If the quantized dimension + // changes, the following logic no longer works as the same `params` + // shouldn't be used for both input and output quantization params. + // E.g. During TransposeOp's quantization propagation in + // PrepareQuantize, if the quantization is per-axis and the + // QuantizedDimension is transposed, then the output q-dq params must + // reflect the new QuantizedDimension. So, check and skip the + // propagation if any of the operands has a per-axis quantized type param + // and `RequiredSameQuantizedAxes` set to false. + // Currently, these lines of code are only applicable to TFL_TransposeOp + // and TFL_ReshapeOp. And the output q-dq propagation for this Op is + // performed in `PropagateTransposedPerAxisQuantDim` and + // `PropagateReshapedPerAxisQuantDim` respectively. + if (is_qdq_conversion_ && + !scale_spec->required_same_quantized_axes_func()) { + if (HasPerAxisQuantizedOperand(op)) continue; + } + + // Use the final state to set all the operands' parameters. + for (int i = 0; i < op->getNumOperands(); ++i) { + if (auto type = + mlir::dyn_cast(op->getOperand(i).getType())) { + // Without this check, it will accidentally propagate the quantization + // information by the shared non-float tensors. + if (mlir::isa(type.getElementType())) + changed |= SetOperandParams(op, i, params); + } + } + + // Use the final state to set all the results' parameters. + for (int i = 0; i < op->getNumResults(); ++i) + if (auto type = mlir::dyn_cast(op->getResult(i).getType()); + type != nullptr) { + // Without this check, it will accidentally propagate the quantization + // information by the shared non-float-tensors. 
+ if (mlir::isa(type.getElementType())) + changed |= SetResultParams(op, i, params); + } + } + + // If the model already contains immutable QDQs, require upstream to + // explicitly fix output range instead. + if (scale_spec->has_fixed_output_range && infer_tensor_range_ && + !is_qdq_conversion_) { + // Infer ranges from the activation ops. This is usually required for + // the post-training quantization workflow. + // TODO: b/323478683 - Different result can have different fixed range. + const QuantizedType params = + scale_spec->fixed_output_range_func(is_signed_, bit_width_); + for (auto i = 0; i < op->getNumResults(); ++i) { + // The range is null if the result has been quantized. + if (params) { + changed |= SetResultParams(op, i, params); + } + } + } + + const std::unique_ptr spec = GetQuantSpec(op); + for (const auto& [bias_operand_idx, non_bias_params] : + spec->biases_params) { + const auto& [non_bias_operand_indices, accumulator_scale_func] = + non_bias_params; + const QuantizedType params = + GetBiasParams(op, bias_operand_idx, non_bias_operand_indices, + accumulator_scale_func); + if (!params) { + quantized_.erase(op); + continue; + } + changed |= SetBiasParamsWithAdjustments(op, bias_operand_idx, + non_bias_operand_indices, params); + } + } + + return changed; +} + +// Finalizes the arguments and result states in the function. +void QuantizationDriver::Finalize() { + for (BlockArgument arg : args_) { + const QuantState& state = GetArgQuantState(arg); + RequantizeStates& requantizes = GetArgRequantizeStates(arg); + if (state.IsEmpty() || (state.immutable && requantizes.empty())) { + continue; + } + + if (!state.immutable) { + QuantizeArg(arg, state.params); + } + + if (!requantizes.empty()) { + RequantizeArg(arg, requantizes); + } + } + + for (const auto& [op_with_result_idx, quant_state_idx] : result_states_) { + const auto [op, result_idx] = op_with_result_idx; + const QuantState& state = GetResultQuantState(op, result_idx); + RequantizeStates& requantizes = GetResultRequantizeStates(op, result_idx); + if (state.IsEmpty() || (state.immutable && requantizes.empty())) { + continue; + } + + if (!state.immutable) { + QuantizeOpResult(op, result_idx, state.params); + } + + if (!requantizes.empty()) { + RequantizeOpResult(op, result_idx, requantizes); + } + } +} + +// Runs quantization in following steps: +// 1. Scans the operations in the function to setup the initial +// states for quantization parameter propagation. +// 2. Propagates the quantization parameters to the operands, results, and +// biases. +// 3. Finalizes the arguments and result states in the function. 
+void QuantizationDriver::Run() { + Initialize(); + if (PropagateParamsAndReturnIfChanged()) { + Finalize(); + } +} + +void ApplyQuantizationParamsPropagation( + const func::FuncOp func, const bool is_signed, const int bit_width, + const bool disable_per_channel, + const OpQuantSpecGetter op_quant_spec_getter, + const bool infer_tensor_ranges, const bool legacy_float_scale, + const bool is_qdq_conversion) { + ApplyQuantizationParamsPropagation( + func, is_signed, bit_width, disable_per_channel, op_quant_spec_getter, + GetDefaultQuantScaleSpec, infer_tensor_ranges, legacy_float_scale, + is_qdq_conversion); +} + +void ApplyQuantizationParamsPropagation( + const func::FuncOp func, const bool is_signed, const int bit_width, + const bool disable_per_channel, + const OpQuantSpecGetter op_quant_spec_getter, + const OpQuantScaleSpecGetter op_quant_scale_spec_getter, + const bool infer_tensor_ranges, const bool legacy_float_scale, + const bool is_qdq_conversion) { + QuantizationDriver(func, is_signed, bit_width, disable_per_channel, + op_quant_spec_getter, op_quant_scale_spec_getter, + infer_tensor_ranges, legacy_float_scale, is_qdq_conversion) + .Run(); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_driver.h b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_driver.h new file mode 100644 index 000000000000..c7bb1c55c521 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_driver.h @@ -0,0 +1,387 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_QUANTIZATION_LIB_TF_QUANTIZATION_DRIVER_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_QUANTIZATION_LIB_TF_QUANTIZATION_DRIVER_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" + +namespace mlir { +namespace tf_quant { + +// The state for each op result during the quantization parameters propagation. +struct QuantState { + // Quantization parameters propagated to an op result. 
+ QuantizedType params; + // A flag indicates this state (the params) shouldn't be changed after it is + // initialized. This flag will be set to true if the quantization parameters + // are from the quantization-aware training. + const bool immutable; + + bool IsEmpty() const { return params == nullptr; } +}; + +// The state for rescaling the propagated quantization parameters. This can be +// on the input side to satisfy the constraint of previous operation, or on the +// output side to satisfy the constraint of the next operation. +struct RequantizeState { + // Sometimes, we have to "requantize" the quantization result to satisfy all + // the constraints. The "requantize" can happen either on the input or output + // of the quantization result. + enum RequantizePosition { + NO_REQUANTIZE, + ON_INPUT, + ON_OUTPUT + } pos = NO_REQUANTIZE; + + // Quantization parameters will be used to add the requantize ops. + QuantizedType params; + + // Avoid clobbering all uses of the value, limit to just these ops. + SmallVector> users; +}; + +using RequantizeStates = SmallVector; + +// This is a worklist-driven driver for propagating quantization parameters +// across operations. +// +// The initial quantization parameters are extracted from the quantized type +// between adjacent `mlir::quant::ir::QuantizeCastOp` and +// `mlir::quant::ir::DequantizeCastOp`s. All these initial parameters are marked +// as immutable because they are from quantization-aware training. +// +// The algorithm traverses each op and sets the quantization parameters of its +// operands and results, according to its quantization specification, and then +// adds the operands and results to the worklist. If there are any conflicts +// (for example, there are quantization parameters propagated from the previous +// iteration), this process stops if the existing parameters are the immutable, +// or adding `requantize` op to resolve the conflicts. +// +// After the algorithm is converged, pairs of `mlir::quant::ir::QuantizeCastOp` +// and `mlir::quant::ir::DequantizeCastOp` are inserted to the right position to +// materialize the propagation and requantize results. +// +class QuantizationDriver { + public: + // Type alias of int used to access `states_`. + using QuantStateIndex = int; + + // (op, operand index) pair. + using OpWithOperandIndex = std::pair; + + // (op, result index) pair. + using OpWithResultIndex = std::pair; + + explicit QuantizationDriver(func::FuncOp func_op, const bool is_signed, + const int bit_width, + const bool disable_per_channel, + OpQuantSpecGetter op_quant_spec_getter, + OpQuantScaleSpecGetter op_quant_scale_spec_getter, + const bool infer_tensor_range, + const bool legacy_float_scale = false, + const bool is_qdq_conversion = false) + : fn_(func_op), + builder_(func_op.getBody()), + is_signed_(is_signed), + bit_width_(bit_width), + disable_per_channel_(disable_per_channel), + op_quant_spec_getter_(op_quant_spec_getter), + op_quant_scale_spec_getter_(op_quant_scale_spec_getter), + infer_tensor_range_(infer_tensor_range), + legacy_float_scale_(legacy_float_scale), + is_qdq_conversion_(is_qdq_conversion) {} + + // The entry point of the quantization parameters propagation. + void Run(); + + // Sets up the states for all the op results in the function. + void Initialize(); + + // Propagates the quantization parameters across all the ops. + bool PropagateParamsAndReturnIfChanged(); + + // Inserts the Quantize and Dequantize ops according to the propagation + // result. 
+ void Finalize(); + + SmallVector GetArgs() { return args_; } + + llvm::DenseMap, int> GetResultStates() { + return result_states_; + } + + DenseMap result_states_; + + // Returns the state of the block argument. + QuantState& GetArgQuantState(BlockArgument arg) { + return states_[arg_states_[arg]]; + } + + // Returns the state of the index-th result of the op. + QuantState& GetResultQuantState(Operation* op, const int index) { + return states_[result_states_[{op, index}]]; + } + + private: + // Duplicates the constant op if it has multiple uses, and replaces + // target_op->operand[operand_index] with the newly created op. This also + // replaces corresponsing quantization states. + arith::ConstantOp DuplicateConstantOpIfNeeded(arith::ConstantOp op, + Operation* target_op, + int operand_index); + + // Adjusts bias scale that is derived from other scales (fc, conv ops) to + // prevent overflow of quantized bias values. This also changes quantization + // state of other inputs when needed. + bool SetBiasParamsWithAdjustments(Operation* op, int bias_index, + ArrayRef input_indices, + QuantizedType params); + + // Checks preconditions to adjust bias scale. + bool ShouldCheckBiasScale(Operation* op, int bias_index, + ArrayRef input_indices, + QuantizedType quantized_type, int& input_index, + int& filter_index); + + // Preprocesses the constants by doing the following: + // - Duplicates constants if it is used by multiple ops. For example, if a + // constant is used by multiple ops as a bias, duplicate constants and + // let each op assign its own quantization parameter for bias. + // - Adds all the non-bias constants (weights) to a set for looking up + // later. + // - Adds all per-channel weights to a set for looking up later. + void PreprocessConstantOps(); + + // Sets up all the data structures for quantization propagation. + void SetupAllStates(); + + // Returns Whether the constant is a weight, which shouldn't be shared by + // different ops. + bool IsWeight(Operation* cst) { return llvm::is_contained(weights_, cst); } + + // Returns all the related quantization constraints of the op. + std::unique_ptr GetQuantSpec(Operation* op); + std::unique_ptr GetQuantScaleSpec(Operation* op); + + // Returns whether quantization parameters have been propagated to the results + // of this op. + bool IsQuantized(Operation* op); + + // Adds all the users of index-th result of op to the work list. + void AddUserToList(Operation* op, const int index) { + for (Operation* user : op->getResult(index).getUsers()) { + work_list_.push_back(user); + } + } + + // Adds the defining op of index-th operand of op to the work list. + void AddOperandToList(Operation* op, const int index) { + if (Operation* operand_op = op->getOperand(index).getDefiningOp(); + operand_op != nullptr) { + work_list_.push_back(operand_op); + } + } + + // Returns the quantization params for the bias input from the non-bias + // operands which have their indexes in the `non_biases` vector. The returned + // parameters are calculated by `func`. + QuantizedType GetBiasParams(Operation* op, int bias_index, + ArrayRef non_bias_operand_indices, + AccumulatorScaleFunc func); + + // Sets the quantization parameters of the result to `quantized_type`. If + // any quantization parameters have been propagated, a requantize will + // happen on the input of propagated quantization. Returns `true` if internal + // state has been modified. 
+ bool SetResultParams(Operation* op, int result_index, + QuantizedType quantized_type); + + // Sets the quantization parameters of the operand to `quantized_type`. If any + // quantization parameters have been propagated, a `requantize` will happen on + // the output of propagated quantization. When `override` is set, quantization + // state of the value is replaced instead of adding requantization. Returns + // `true` if internal state has been modified. + bool SetOperandParams(Operation* op, int operand_index, + QuantizedType quantized_type, bool override = false); + + // Sets the quantization parameters of the constant result according to its + // content. + bool SetConstantResultParams(Operation* op); + + // Inserts the Quantize and Dequantize ops after `op`'s `index`-th result. The + // quantized element type for the result is `quantized_type`. + void QuantizeOpResult(Operation* op, int result_index, + QuantizedType quantized_type); + + // Inserts the Quantize and Dequantize ops after `arg`. The quantized element + // type for `arg` is `quantized_type`. + void QuantizeArg(BlockArgument arg, QuantizedType quantized_type); + + // Inserts the Quantize and Dequantize ops (i.e. QDQ) after `value`. The + // quantized element type for `value` is `quantized_type`. + void QuantizeValue(Value value, QuantizedType quantized_type, Location loc); + + // Inserts the Quantize ops for requantizing the index-th result of the op. + void RequantizeOpResult(Operation* op, int result_index, + RequantizeStates& states); + + // Inserts the Quantize ops for requantizing a block argument. + void RequantizeArg(BlockArgument arg, RequantizeStates& states); + + // Inserts the Quantize and Dequantize ops to quantize the value and returns + // the Quantize op. + void RequantizeValue(Value value, RequantizeStates& states, Location loc); + + // Returns the quantization parameter satisfies the same scale + // constraints for the op. Returns an empty option if this quantization + // parameter doesn't exist. + QuantizedType GetQuantParamsForSameScaleConstraint(Operation* op); + + // Returns the state of the index-th operand of the op. + QuantState& GetOperandQuantState(Operation* op, const int index) { + return states_[operand_states_[{op, index}]]; + } + + // Returns the states of the index-th operand of the op. + RequantizeStates& GetOperandRequantizeStates(Operation* op, const int index) { + return rescale_states_[operand_states_[{op, index}]]; + } + + // Returns the states of the index-th result of the op. + RequantizeStates& GetResultRequantizeStates(Operation* op, const int index) { + return rescale_states_[result_states_[{op, index}]]; + } + + // Returns the states of the arg. + RequantizeStates& GetArgRequantizeStates(BlockArgument arg) { + return rescale_states_[arg_states_[arg]]; + } + + // Sets the state of an argument. If this value is cached, uses the cached + // result without creating new entry in the state vector. Otherwise, allocate + // a new entry in the state vector. + void InitializeArgState(BlockArgument arg, Value arg_value); + + // Sets the state of the index-th operand of the op. If this operand is + // cached, uses the cached result without creating new entry in the state + // vector. Otherwise, allocate a new entry in the state vector. + void InitializeOperandState(Operation* op, int index, Value value); + + // Sets the state of the index-th result of the op. If this result is cached, + // uses the cached result without creating new entry in the state vector. 
+ // Otherwise, allocate a new entry in the state vector. + void InitializeResultState(Operation* op, int index, Value value); + + func::FuncOp fn_; + OpBuilder builder_; + const bool is_signed_; + const int bit_width_; + const bool disable_per_channel_; + + // We should distinguish weights and bias constants. Biases are specified by + // the quantization spec or are the operands of ops with same scale spec. The + // rest are weights. + DenseSet weights_; + + // The weights require narrow_range quantization. This map collects all the + // weight operands defined by the op quant spec. The value of each entry is + // the quantization dimension. If it is positive, per-channel quantization is + // required. + DenseMap optimized_weights_; + + // All the ops needs to propagate the quantization parameters to. + std::vector work_list_; + absl::flat_hash_set quantized_; + + // The vector contains all the quantization parameters propagated from the + // defining operations of the value, or from the quantization aware training. + std::vector states_; + + // The map contains all the quantization parameters which are required to + // satisfy the same operands and results constraint. The keys of this map are + // the values from `operand_states_` and `result_state_`. + absl::flat_hash_map rescale_states_; + + // Maps of indexes to the propagation state vector from the ops operands, + // results and arguments. + DenseMap operand_states_; + DenseMap arg_states_; + DenseMap value_to_state_; + + // This vector is to preserve the arguments order, so the newly inserted + // quantized ops for the arguments are deterministically ordered. + SmallVector args_; + + OpQuantSpecGetter op_quant_spec_getter_; + OpQuantScaleSpecGetter op_quant_scale_spec_getter_; + + // Infer output ranges for activation ops and constants. This is usually + // required for post-training quantization. + const bool infer_tensor_range_; + + // Calculate scales in float instead of double, so that the scales and + // quantized values are exactly the same with the TOCO quantizer. + const bool legacy_float_scale_; + + // If true, the model is a floating point graph with QDQ ops to be eliminated + // and fused into quantized kernels. + const bool is_qdq_conversion_; +}; + +// Propagates quantization parameters across ops in this function and satisfies +// the quantization specification of the ops. This methods assumes the initial +// quantization parameters are stored as adjacent quantize and dequantize ops +// and the propagation results are materialized by inserting pairs of quantize +// and dequantize ops to this function. Set `disable_per_channel` to true to not +// use per channel quantization even the op supports it. +// Setting `infer_tensor_range` to true, to infer quantization parameters from +// the activation ops and weight constants. This is only used for post-training +// quantization. 
+void ApplyQuantizationParamsPropagation(func::FuncOp func, bool is_signed, + int bit_width, bool disable_per_channel, + OpQuantSpecGetter op_quant_spec_getter, + bool infer_tensor_ranges, + bool legacy_float_scale, + bool is_qdq_conversion); + +void ApplyQuantizationParamsPropagation( + func::FuncOp func, bool is_signed, int bit_width, bool disable_per_channel, + OpQuantSpecGetter op_quant_spec_getter, + OpQuantScaleSpecGetter op_quant_scale_spec_getter, bool infer_tensor_ranges, + bool legacy_float_scale, bool is_qdq_conversion); + +} // namespace tf_quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_QUANTIZATION_LIB_TF_QUANTIZATION_DRIVER_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_driver_test.cc b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_driver_test.cc new file mode 100644 index 000000000000..1c7a12fb2658 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_driver_test.cc @@ -0,0 +1,168 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_driver.h" + +#include +#include +#include +#include + +#include +#include +#include "absl/strings/string_view.h" +#include "llvm/ADT/DenseMap.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/common/func.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/test_base.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir::tf_quant { +namespace { + +using ApplyQuantizationParamsPropagationTest = + mlir::quant::QuantizationTestBase; +using ::testing::IsEmpty; +using ::testing::Not; + +constexpr absl::string_view kModuleTFLite = R"mlir( + module { + func.func @main(%arg0: tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> attributes {_from_xla_call_module} { + %cst_0 = arith.constant dense<1.0> : tensor<3x1x1x3xf32> + %cst_1 = arith.constant dense<2.0> : tensor<3xf32> + %0 = "tf.XlaCallModule"(%arg0, 
%cst_0, %cst_1) <{Sout = [#tf_type.shape<1x4x4x3>], module = "", version = 9 : i64}> {_entry_function = @composite_fn_1, _stablehlo_version = "1.0.0", _original_entry_function = "composite_fn_1", _tfl_quant_trait = "fully_quantizable"} : (tensor<1x4x4x3xf32>, tensor<3x1x1x3xf32>, tensor<3xf32>) -> tensor<1x4x4x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst_0, %cst_1) <{Sout = [#tf_type.shape<1x4x4x3>], module = "", version = 9 : i64}> {_entry_function = @composite_fn_2, _stablehlo_version = "1.0.0", _original_entry_function = "composite_fn_2", _tfl_quant_trait = "fully_quantizable"} : (tensor<1x4x4x3xf32>, tensor<3x1x1x3xf32>, tensor<3xf32>) -> tensor<1x4x4x3xf32> + return %1 : tensor<1x4x4x3xf32> + } + func.func private @composite_fn_1(%arg0: tensor<1x4x4x3xf32>, %arg1: tensor<3x1x1x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x4x4x3xf32> attributes {tf_quant.composite_function} { + %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x4x4x3xf32>, tensor<3x1x1x3xf32>, tensor<3xf32>) -> tensor<1x4x4x3xf32> + return %0 : tensor<1x4x4x3xf32> + } + func.func private @composite_fn_2(%arg0: tensor<1x4x4x3xf32>, %arg1: tensor<3x1x1x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x4x4x3xf32> attributes {tf_quant.composite_function} { + %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x4x4x3xf32>, tensor<3x1x1x3xf32>, tensor<3xf32>) -> tensor<1x4x4x3xf32> + return %0 : tensor<1x4x4x3xf32> + } + } +)mlir"; + +// TOOD: b/323478683 - Directly use types rather than creating a `unique_ptr`. 
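+// The spec built below mimics a conv-like composite for the test: operand 1
+// is treated as a per-channel weight quantized along dimension 3, and
+// operand 2 as a bias whose parameters are derived from operands 0 and 1
+// through `GetUniformQuantizedTypeForBias`.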
+std::unique_ptr GetOpQuantSpec( + const mlir::Operation* op, + bool disable_per_channel_for_dense_layers = false) { + auto spec = std::make_unique(); + spec->coeff_op_quant_dim[1] = 3; + spec->biases_params[2] = {{0, 1}, GetUniformQuantizedTypeForBias}; + for (const auto& [key, value] : spec->coeff_op_quant_dim) { + spec->quantizable_operands.insert(key); + } + return spec; +} + +TEST_F(ApplyQuantizationParamsPropagationTest, + ConstsUsedMultipleTimesAreDuplicated) { + const OwningOpRef module_op_ref = + mlir::quant::QuantizationTestBase::ParseModuleOpString(kModuleTFLite); + func::FuncOp main_fn = mlir::quant::FindMainFuncOp(*module_op_ref); + + auto op_quant_spec_getter = [&](mlir::Operation* op) { + return GetOpQuantSpec(op, /*disable_per_channel_for_dense_layers=*/false); + }; + QuantizationDriver quantization_driver( + main_fn, /*is_signed=*/true, /*bit_width=*/8, + /*disable_per_channel=*/false, op_quant_spec_getter, + GetDefaultQuantScaleSpec, + /*infer_tensor_range=*/true, /*legacy_float_scale=*/false, + /*is_qdq_conversion=*/false); + + quantization_driver.Initialize(); + + int64_t num_constant_op = 0; + main_fn.walk([&](arith::ConstantOp cst) { ++num_constant_op; }); + EXPECT_EQ(num_constant_op, 4); +} + +TEST_F(ApplyQuantizationParamsPropagationTest, + PropagateParamsCreatesQuantState) { + const OwningOpRef module_op_ref = + ParseModuleOpString(kModuleTFLite); + func::FuncOp main_fn = mlir::quant::FindMainFuncOp(*module_op_ref); + + auto op_quant_spec_getter = [&](mlir::Operation* op) { + return GetOpQuantSpec(op, /*disable_per_channel_for_dense_layers=*/false); + }; + QuantizationDriver quantization_driver( + main_fn, /*is_signed=*/true, /*bit_width=*/8, + /*disable_per_channel=*/false, op_quant_spec_getter, + GetDefaultQuantScaleSpec, + /*infer_tensor_range=*/true, /*legacy_float_scale=*/false, + /*is_qdq_conversion=*/false); + + quantization_driver.Initialize(); + ASSERT_TRUE(quantization_driver.PropagateParamsAndReturnIfChanged()); + EXPECT_THAT(quantization_driver.GetArgs(), Not(IsEmpty())); + + for (const auto& arg : quantization_driver.GetArgs()) { + const QuantState& state = quantization_driver.GetArgQuantState(arg); + EXPECT_TRUE(isa(state.params)); + } + for (const auto& result : quantization_driver.GetResultStates()) { + Operation* op = result.first.first; + const int res_index = result.first.second; + const QuantState state = + quantization_driver.GetResultQuantState(op, res_index); + EXPECT_TRUE(isa(state.params)); + } +} + +TEST_F(ApplyQuantizationParamsPropagationTest, FinalizeInsertsQDQOps) { + const OwningOpRef module_op_ref = + ParseModuleOpString(kModuleTFLite); + func::FuncOp main_fn = mlir::quant::FindMainFuncOp(*module_op_ref); + + auto op_quant_spec_getter = [&](mlir::Operation* op) { + return GetOpQuantSpec(op, /*disable_per_channel_for_dense_layers=*/false); + }; + ApplyQuantizationParamsPropagation( + main_fn, /*is_signed=*/true, /*bit_width=*/8, + /*disable_per_channel=*/false, op_quant_spec_getter, + /*infer_tensor_ranges=*/true, /*legacy_float_scale=*/false, + /*is_qdq_conversion=*/false); + Operation* xla_call_module_op = + mlir::quant::FindOperationOfType(main_fn); + Operation* filter_dcast_op = + xla_call_module_op->getOperand(1).getDefiningOp(); + Operation* filter_qcast_op = filter_dcast_op->getOperand(0).getDefiningOp(); + ASSERT_NE(filter_qcast_op, nullptr); + EXPECT_TRUE(isa(filter_qcast_op)); + EXPECT_TRUE(isa(filter_dcast_op)); + EXPECT_TRUE(isa( + mlir::cast(filter_qcast_op->getResult(0).getType()) + .getElementType())); +} + +} // 
namespace +} // namespace mlir::tf_quant diff --git a/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_traits.h b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_traits.h new file mode 100644 index 000000000000..07e38c5f3ebf --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_traits.h @@ -0,0 +1,152 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the op traits used in the MLIR TensorFlow Lite dialect. + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_QUANTIZATION_LIB_TF_QUANTIZATION_TRAITS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_QUANTIZATION_LIB_TF_QUANTIZATION_TRAITS_H_ + +#include +#include +#include + +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +using QuantizedType = mlir::quant::QuantizedType; +using UniformQuantizedType = mlir::quant::UniformQuantizedType; + +namespace mlir { +namespace tf_quant { +// Verifies that the op satisfies the same operands and results scales +// constraints. Note that this constraint can only be applied on some +// storage types of the op. +LogicalResult VerifySameScales(Operation* op); +} // namespace tf_quant + +// This includes the interface class definition. It couldn't be in a namespace +// because the table gen doesn't emit the namespace when it is used. +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_interface.h.inc" + +namespace OpTrait { +namespace tf_quant { + +// The base class that all the quantization related OpTrait implements. +template class TraitType> +struct QuantizationSpecTraitBase : public TraitBase { + static bool IsBias(int index) { return false; } + static bool IsQuantizable() { return true; } +}; + +// This class provides the API for ops that has a fixed output value range. +// This is used as a trait like this: +// +// class SoftmaxOp +// : public Op::Impl> { +// +// TODO(fengliuai): create a better way to express floating point scale in the +// template argument list. 
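+// The result scale is reconstructed as ScaleMantissa * 10^ScaleExp; for
+// example, a fixed scale of 1/256 (0.00390625) can be expressed with
+// ScaleMantissa = 390625 and ScaleExp = -8.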
+template +class FixedResultUniformScale { + public: + template + class Impl + : public QuantizationSpecTraitBase< + ConcreteType, FixedResultUniformScale< + BitWidth, ZeroPoint, ScaleMantissa, ScaleExp, + StorageTypeMin, StorageTypeMax, Sign>::Impl> { + public: + QuantizedType GetResultQuantizedType(int index) { + auto op = this->getOperation(); + const auto result_type = + op->getResult(index).getType().template cast(); + if (!result_type.getElementType().template isa()) return {}; + Builder builder(op->getContext()); + const IntegerType storage_type = builder.getIntegerType(BitWidth); + const double scale = static_cast(ScaleMantissa) * + std::pow(10.0, static_cast(ScaleExp)); + return UniformQuantizedType::getChecked( + Sign, storage_type, result_type.getElementType(), scale, ZeroPoint, + StorageTypeMin, StorageTypeMax, builder.getUnknownLoc()); + } + }; +}; + +// This class provides the API for ops that has input as bias. This is used +// as a trait like this: +// +// class Conv2DOp +// : public Op::Impl> +// +// TODO(fengliuai): supports a configurable accumulator bit width. +template +class AccumulatorUniformScale { + public: + template + class Impl + : public QuantizationSpecTraitBase< + ConcreteType, AccumulatorUniformScale::Impl> { + public: + // Whether the index-th operand is a bias. + static bool IsBias(int index) { return index == Bias; } + + // Returns the indexes of all the non-bias operands. + static std::vector GetAllNonBiasOperands() { + return std::vector({Operands...}); + } + }; +}; + +// The trait to specify the operand index of the coefficient for an affine op +// and also the quantization dimension if per-axis quantization is support. +// If the quantization dimension is -1, per-axis quantization isn't supported. +// +// class Conv2DOp +// : public Op::Impl> +// +template +class AffineOpCoefficient { + public: + template + class Impl + : public TraitBase::Impl> { + public: + static int GetCoefficientOperandIndex() { return OperandIndex; } + static int GetQuantizationDim() { return QuantDim; } + }; +}; + +// This class provides the API for ops that can be quantized. +// This is as a trait like this: +// +// class LessOp : public Op { +// +template +class QuantizableResult + : public QuantizationSpecTraitBase {}; + +} // namespace tf_quant +} // namespace OpTrait +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_QUANTIZATION_LIB_TF_QUANTIZATION_TRAITS_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.cc b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.cc new file mode 100644 index 000000000000..2beccf116125 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.cc @@ -0,0 +1,1078 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/lite/toco_legacy/portable_tensor_utils.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantizeUtils.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_traits.h" +#include "tensorflow/compiler/mlir/tools/optimize/quantization_utils.h" + +namespace mlir { + +// This includes the interface class definition. It couldn't be in a namespace +// because the table gen doesn't emit the namespace when it is used. +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_interface.cc.inc" + +namespace tf_quant { +namespace { + +constexpr double kSmallestHalfRange = kNearZeroTolerance / 2; +using QType = quant::QuantizedType; + +// Repeats the content of `data` multiple times to resize to `target_size`. +// Note that this only broadcast across one dimension. +template +bool BroadcastVector(int target_size, SmallVectorImpl& data) { + const int size = data.size(); + if (size != target_size) { + if (target_size % size != 0) return true; + data.reserve(target_size); + for (int i = 1; i < target_size / size; ++i) { + data.insert(data.end(), data.begin(), data.begin() + size); + } + } + return false; +} + +// Expands the range to be larger than or equal to 1.0e-6, if it is +// very small (< 1.0e-6). This is to prevent very large quantized value by this +// range. +void ExpandVerySmallRange(const ArrayRef mins, + const ArrayRef maxs, + SmallVectorImpl& effective_mins, + SmallVectorImpl& effective_maxs) { + for (const auto [min, max] : llvm::zip(mins, maxs)) { + // The range is small. Expands the range to stride 0.0 and also at least + // 1.0e-6. 
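+    // Example (assuming kNearZeroTolerance is the 1.0e-6 mentioned above): a
+    // degenerate range with min == max == 0.0 is widened to
+    // [-kSmallestHalfRange, kSmallestHalfRange] = [-5.0e-7, 5.0e-7] so that a
+    // usable scale can still be derived, while a range such as [0.0, 6.0]
+    // passes through unchanged.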
+ if (max - min > kNearZeroTolerance) { + effective_mins.push_back(min); + effective_maxs.push_back(max); + } else { + effective_mins.push_back(std::min(min, -kSmallestHalfRange)); + effective_maxs.push_back(std::max(max, kSmallestHalfRange)); + } + } +} + +// Sets the min / max, scale and zero_points from the fake quant num_bits +// attribute from QAT. +QuantizedType ResetMinMaxFromNumBits(const QuantizedType type, + const int num_bits, + const bool narrow_range, + const bool is_signed) { + if (num_bits >= 8) { + return type; + } + int64_t qmin = QType::getDefaultMinimumForInteger(is_signed, num_bits); + int64_t qmax = QType::getDefaultMaximumForInteger(is_signed, num_bits); + if (narrow_range) { + qmin += 1; + } + const int64_t storage_type_min = type.getStorageTypeMin(); + const int64_t storage_type_max = type.getStorageTypeMax(); + const double rate = + static_cast(storage_type_max - storage_type_min) / (qmax - qmin); + const auto& recalculate_scale = [&](double scale) -> double { + return scale * rate; + }; + const auto& recalculate_zero_point = [&](int64_t zero_point) -> int64_t { + return qmax - std::round((storage_type_max - zero_point) / rate); + }; + if (auto q_type = dyn_cast(type)) { + const double scale = recalculate_scale(q_type.getScale()); + const double zero_point = recalculate_zero_point(q_type.getZeroPoint()); + return UniformQuantizedType::get(q_type.getFlags(), q_type.getStorageType(), + q_type.getExpressedType(), scale, + zero_point, qmin, qmax); + } else if (auto q_type = dyn_cast(type)) { + const int size = q_type.getScales().size(); + SmallVector scales(size); + SmallVector zero_points(size); + for (int i = 0; i < size; ++i) { + scales[i] = recalculate_scale(q_type.getScales()[i]); + zero_points[i] = recalculate_zero_point(q_type.getZeroPoints()[i]); + } + return quant::UniformQuantizedPerAxisType::get( + q_type.getFlags(), q_type.getStorageType(), q_type.getExpressedType(), + scales, zero_points, q_type.getQuantizedDimension(), qmin, qmax); + } else { + llvm_unreachable("Unsupported QuantizedType in ResetMinMaxFromNumBits"); + } + return type; +} + +// Changes the axis of the input per-channel quantized type to match the +// dimension of the target type. Returns nullptr if it fails. +quant::UniformQuantizedPerAxisType ResetAxisAndBroadcast( + const ArrayRef shape, + const quant::UniformQuantizedPerAxisType qtype, const Type target, + const int quant_dim) { + const auto shaped = dyn_cast(target); + if (!shaped) return {}; + const ArrayRef new_shape = shaped.getShape(); + + SmallVector scales(qtype.getScales().begin(), + qtype.getScales().end()); + SmallVector zero_points(qtype.getZeroPoints().begin(), + qtype.getZeroPoints().end()); + + if (new_shape.size() == shape.size()) { // same rank + // Broadcast the scales and zero points to match the target size, which is + // usually the axis-th dimension of the target type. Currently, it covers + // two cases: + // - for Transpose, the data layout is changed so the `dim[axis]` still + // equals to the `scales_size`. The broadcast skips; + // - for Reshape, the data layout isn't changed but the innermost dimension + // is expand to cover the last two original dimensions. Thus we just need to + // be repeated the `scales` dim[2] times to covers the new dim length. 
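+    // For instance (hypothetical shapes): per-channel scales {s0, s1, s2}
+    // broadcast onto a target whose quantized dimension has size 6 become
+    // {s0, s1, s2, s0, s1, s2}; if the target size is not a multiple of the
+    // original count, BroadcastVector reports failure and a null type is
+    // returned below.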
+ if (BroadcastVector(shaped.getDimSize(quant_dim), scales) || + BroadcastVector(shaped.getDimSize(quant_dim), zero_points)) { + return {}; + } + } else if ((new_shape.size() == shape.size() + 1) && new_shape.front() == 1) { + // Handle the [A, B, C] -> [1, A, B, C] reshape case. + if (!(std::equal(shape.begin(), shape.end(), new_shape.begin() + 1) && + quant_dim == new_shape.size() - 1)) { + return {}; + } + } else { + return {}; + } + + return quant::UniformQuantizedPerAxisType::get( + qtype.getFlags(), qtype.getStorageType(), qtype.getExpressedType(), + scales, zero_points, quant_dim, qtype.getStorageTypeMin(), + qtype.getStorageTypeMax()); +} + +} // namespace + +bool IsOpQuantizable(Operation* op) { + if (isa( + op)) { + // Constant ops do not have QuantizableResult attribute but they can deal + // with quantized tensors. + return true; + } else if (op->hasTrait() || + isa(op)) { + // Terminators, qcast and decast are not quantizable. + return false; + } + + const bool attr_enforced_quantizable = + op->hasAttrOfType(kQuantTraitAttrName) && + op->getAttrOfType(kQuantTraitAttrName).getValue().str() == + QuantTraitValues[QuantizationTrait::FullyQuantizable]; + + const bool attr_output_quantized = QuantizableOpSupportsFloatOutputType(op); + + const bool trait_enforced_quantizable = + op->hasTrait(); + + return attr_enforced_quantizable || trait_enforced_quantizable || + attr_output_quantized; +} + +// Checks if an op has specific attributes that enable quantized inputs with +// float outputs. +bool QuantizableOpSupportsFloatOutputType(Operation* op) { + static constexpr char kOutputTypes[] = "_output_types"; + static constexpr char kSupportOutputTypeFloat[] = + "_support_output_type_float_in_quantized_op"; + + if (!(op->hasAttrOfType(kOutputQuantized) && + op->getAttrOfType(kOutputQuantized).getValue())) { + return false; + } + + if (!(op->hasAttrOfType(kSupportOutputTypeFloat) && + op->getAttrOfType(kSupportOutputTypeFloat) + .getValue())) { + return false; + } + + if (!op->hasAttrOfType(kOutputTypes)) { + return false; + } + + auto output_types_attr = op->getAttrOfType(kOutputTypes); + + if (output_types_attr.size() != op->getResultTypes().size()) { + return false; + } + + for (const auto [attr_element, result_type] : + llvm::zip_equal(output_types_attr, op->getResultTypes())) { + auto type_attr = mlir::dyn_cast_or_null(attr_element); + + if (!type_attr) { + return false; + } + + auto tensor_type = mlir::dyn_cast_or_null(result_type); + + if (!tensor_type) { + return false; + } + + if (type_attr.getValue() != tensor_type.getElementType()) { + return false; + } + } + + return true; +} + +// Returns the quantized type for the +// input_type/min/max/storag_type_width/narrow_range. +// This is entry point to the Quant dialect and used for both quantizing +// activations and weights. +Type GetQuantizedType(Builder builder, const Type input_type, + const ArrayRef min, const ArrayRef max, + const int quant_dim, const int storage_type_width, + const bool narrow_range, const bool is_signed, + const bool legacy_float_scale, + const bool use_fake_quant_num_bits) { + auto converter = + mlir::quant::ir::ExpressedToQuantizedConverter::forInputType(input_type); + + // Expand the range to prevent extremely small scales and large quantized + // integers which can cause overflow. This leads to scale + // 7.843137254901961e-9 with 8 bits. 
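+  // (A scale of 7.843137254901961e-9 corresponds to a range of width 2.0e-6
+  // spread over the 255 steps of an 8-bit storage type, i.e. 2.0e-6 / 255.)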
+ SmallVector effective_mins, effective_maxs; + ExpandVerySmallRange(min, max, effective_mins, effective_maxs); + + quant::QuantizedType quantized_element_type; + if (min.size() == 1 && max.size() == 1 && quant_dim == -1) { + quantized_element_type = quantfork::fakeQuantAttrsToType( + builder.getUnknownLoc(), storage_type_width, effective_mins[0], + effective_maxs[0], narrow_range, converter.expressed_type, is_signed); + if (legacy_float_scale) { + quantized_element_type = + DownCastScale(quantized_element_type, effective_mins[0], + effective_maxs[0], builder.getUnknownLoc()); + } + } else if (min.size() == max.size()) { + auto shape = dyn_cast(input_type); + if (!shape || shape.getRank() <= quant_dim || + static_cast(min.size()) != shape.getDimSize(quant_dim)) { + return {}; + } + // The quantization dim is set to the last dimension. + quantized_element_type = quantfork::fakeQuantAttrsToType( + builder.getUnknownLoc(), storage_type_width, quant_dim, effective_mins, + effective_maxs, narrow_range, converter.expressed_type, is_signed); + if (legacy_float_scale) { + quantized_element_type = + DownCastScale(quantized_element_type, effective_mins, effective_maxs, + builder.getUnknownLoc()); + } + } + if (!quantized_element_type) return {}; + // Use fake quant configured bit-widths (only supported for + // 1 < num_bits < 8 bits) instead of using 8-bit defaults. + if (use_fake_quant_num_bits && storage_type_width > 1 && + storage_type_width < 8 && + quantized_element_type.getStorageTypeMax() > + QType::getDefaultMinimumForInteger(is_signed, storage_type_width)) { + const auto resetEleType = ResetMinMaxFromNumBits( + quantized_element_type, storage_type_width, narrow_range, is_signed); + return converter.convert(resetEleType); + } + return converter.convert(quantized_element_type); +} + +// TODO(fengliuai): promote this utility method to mlir QuantOps. +TypeAttr RescaleQuantizedType(const Type input, const Attribute factor) { + const auto factor_values = dyn_cast_or_null(factor); + if (!factor_values) return {}; + const auto element_type = + quant::QuantizedType::getQuantizedElementType(input); + if (!element_type) return {}; + if (auto qtype = dyn_cast(element_type)) { + const ArrayRef scales = qtype.getScales(); + // Broadcasting hasn't been implemented yet. + if (static_cast(scales.size()) != factor_values.getNumElements()) + return {}; + SmallVector new_scales; + new_scales.reserve(scales.size()); + auto scales_iter = scales.begin(); + for (const auto& f : factor_values) { + new_scales.push_back(*scales_iter * + std::fabs(FloatAttr::getValueAsDouble(f))); + ++scales_iter; + } + // We are assuming symmetric quantization. + auto new_ele_type = quant::UniformQuantizedPerAxisType::get( + qtype.getFlags(), qtype.getStorageType(), qtype.getExpressedType(), + new_scales, qtype.getZeroPoints(), qtype.getQuantizedDimension(), + qtype.getStorageTypeMin(), qtype.getStorageTypeMax()); + if (const auto new_type = new_ele_type.castFromExpressedType( + quant::QuantizedType::castToExpressedType(input))) { + return TypeAttr::get(new_type); + } + } + // Currently, we only support per-axis quantized type. 
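+  // Per-tensor quantized inputs fall through to this point and are rejected
+  // by returning a null attribute.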
+ return {}; +} + +TypeAttr GetQuantizedTypeAttr(const Builder builder, const Type input_type, + const Attribute min, const Attribute max, + const int quant_dim, const IntegerAttr num_bits, + const BoolAttr narrow_range, const bool is_signed, + const bool legacy_float_scale, + const bool use_fake_quant_num_bits) { + SmallVector min_value, max_value; + const auto mins = dyn_cast(min); + const auto maxs = dyn_cast(max); + if (mins && maxs) { + min_value.reserve(mins.getNumElements()); + max_value.reserve(maxs.getNumElements()); + for (auto it = mins.begin(); it != mins.end(); ++it) { + min_value.push_back(FloatAttr::getValueAsDouble(*it)); + } + for (auto it = maxs.begin(); it != maxs.end(); ++it) { + max_value.push_back(FloatAttr::getValueAsDouble(*it)); + } + } else { + const auto fmin = dyn_cast(min); + const auto fmax = dyn_cast(max); + if (fmin && fmax) { + min_value.push_back(fmin.getValueAsDouble()); + max_value.push_back(fmax.getValueAsDouble()); + } else { + return {}; + } + } + const Type final_type = + GetQuantizedType(builder, input_type, min_value, max_value, quant_dim, + num_bits.getInt(), narrow_range.getValue(), is_signed, + legacy_float_scale, use_fake_quant_num_bits); + if (!final_type) return {}; + return TypeAttr::get(final_type); +} + +TypeAttr CastQuantizedTypeAttrFromExpressedType(const Builder builder, + const TypeAttr source, + const Type target, + const int axis) { + const auto source_type = dyn_cast_or_null(source.getValue()); + if (!source_type) return {}; + const auto src_ele_type = source_type.getElementType(); + auto qtype = dyn_cast(src_ele_type); + + // Reset the quantization dimensions if it is per-axis. + if (const auto per_axis = + dyn_cast_or_null(qtype)) { + // For the pass-through ops, we don't know which the dimension will be the + // new quantization dimension. Only if the new quantization dimension can + // be inferred, it is safe to reset the per-axis quantized type. + if (axis == -1) return {}; + qtype = + ResetAxisAndBroadcast(source_type.getShape(), per_axis, target, axis); + } + if (!qtype) return {}; + const Type final_type = qtype.castFromExpressedType(target); + if (!final_type) return {}; + return TypeAttr::get(final_type); +} + +void ExtractMinMaxFromAttr(const DenseFPElementsAttr values, const int dim_size, + const int slice_size, bool symmetric, + SmallVectorImpl& mins, + SmallVectorImpl& maxs) { + // If all the element values are same we don't need to scan the content. + if (values.isSplat()) { + const double single_value = + FloatAttr::getValueAsDouble(values.getSplatValue()); + + // When the single value isn't 0.0, we expand it to a range to include + // this single value and 0.0. This will give us a scale and zero point + // works for both this value and 0.0. + if (single_value < 0.0) { + mins[0] = single_value; + maxs[0] = symmetric ? -single_value : 0.0; + } else if (single_value > 0.0) { + mins[0] = symmetric ? 
-single_value : 0.0; + maxs[0] = single_value; + } else { + mins[0] = maxs[0] = single_value; + } + for (int i = 1; i < dim_size; ++i) { + mins[i] = mins[0]; + maxs[i] = maxs[0]; + } + } else { + int64_t flatten_index = 0; + auto begin = values.begin(); + auto end = values.end(); + for (auto it = begin; it != end; ++it, ++flatten_index) { + const double ele_value = FloatAttr::getValueAsDouble(*it); + const int slice_index = flatten_index / slice_size; + const int channel_index = slice_index % dim_size; + mins[channel_index] = std::min(mins[channel_index], ele_value); + maxs[channel_index] = std::max(maxs[channel_index], ele_value); + } + // Expand range to include 0. + for (int i = 0; i < dim_size; ++i) { + maxs[i] = std::max(maxs[i], 0.0); + mins[i] = std::min(mins[i], 0.0); + } + if (symmetric) { + for (int i = 0; i < dim_size; ++i) { + maxs[i] = std::max(std::abs(mins[i]), std::abs(maxs[i])); + mins[i] = -maxs[i]; + } + } + } +} + +Type GetUniformQuantizedTypeForWeight( + const ElementsAttr attr, const bool symmetric, const unsigned num_bits, + const bool is_signed, const bool narrow_range, + const bool legacy_float_scale, const bool use_fake_quant_num_bits) { + const Builder builder(attr.getContext()); + // `symmetric` can only be used when it is `signed` and `narrow_range`. + if (symmetric && (!is_signed || !narrow_range)) return {}; + + SmallVector mins(1, std::numeric_limits::max()); + SmallVector maxs(1, std::numeric_limits::min()); + const auto fp = dyn_cast(attr); + if (!fp) return {}; + + // Computes the effective min/max values of the attribute values. + ExtractMinMaxFromAttr(fp, /*dim_size=*/1, /*slice_size=*/1, symmetric, mins, + maxs); + + const auto type = + GetQuantizedType(builder, attr.getType(), mins[0], maxs[0], + /*quant_dim=*/-1, num_bits, narrow_range, is_signed, + legacy_float_scale, use_fake_quant_num_bits); + if (const auto ele_type = dyn_cast_or_null(type)) + return ele_type.getElementType(); + + return {}; +} + +Type GetUniformQuantizedPerAxisTypeForWeight( + const ElementsAttr attr, const int quant_dim, const bool symmetric, + const unsigned num_bits, const bool is_signed, const bool narrow_range, + const bool legacy_float_scale, const bool use_fake_quant_num_bits) { + const Builder builder(attr.getContext()); + const auto shape = cast(attr.getType()).getShape(); + if (static_cast(shape.size()) <= quant_dim) return {}; + // `symmetric` can only be used when it is `signed` and `narrow_range`. + if (symmetric && (!is_signed || !narrow_range)) return {}; + + const int dim_size = shape[quant_dim]; + const int slice_size = + std::accumulate(std::next(shape.begin(), quant_dim + 1), shape.end(), 1, + std::multiplies()); + SmallVector mins(dim_size, std::numeric_limits::max()); + SmallVector maxs(dim_size, std::numeric_limits::min()); + const auto fp = dyn_cast(attr); + if (!fp) return {}; + + // Computes the effective min/max values of the attribute values. 
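+  // For example, a weight of shape [2, 3, 4] with quant_dim == 1 gives
+  // dim_size == 3 and slice_size == 4, so flattened element i contributes to
+  // the min/max of channel (i / 4) % 3.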
+ ExtractMinMaxFromAttr(fp, dim_size, slice_size, symmetric, mins, maxs); + + const auto type = GetQuantizedType( + builder, attr.getType(), mins, maxs, quant_dim, num_bits, narrow_range, + is_signed, legacy_float_scale, use_fake_quant_num_bits); + if (auto ele_type = dyn_cast_or_null(type)) + return ele_type.getElementType(); + + return {}; +} + +quant::QuantizedType GetUniformQuantizedTypeForBias( + const std::vector& op_types, + const int adjusted_quant_dim, const bool legacy_float_scale) { + if (op_types.empty()) return {}; + + size_t axis_size = 1; + int32_t quant_dim = -1; + Type expressed_type; + // Requires all the op types are valid UniformQuantizedTypes or + // UniformQuantizedPerAxisTypes and also have same expressed type. For all + // the UniformQuantizedPerAxisTypes, the quantization dimension index and + // dimension sizes are same. + for (const auto op_type : op_types) { + if (!op_type) return {}; + if (expressed_type && expressed_type != op_type.getExpressedType()) { + return {}; + } + expressed_type = op_type.getExpressedType(); + + if (const auto type = + dyn_cast(op_type)) { + if (axis_size != 1 && axis_size != type.getScales().size()) return {}; + if (quant_dim != -1 && quant_dim != type.getQuantizedDimension()) + return {}; + axis_size = type.getScales().size(); + quant_dim = type.getQuantizedDimension(); + } else if (!isa(op_type)) { + return {}; + } + } + + // The scale from the UniformQuantizedTypes is broadcasted if there are + // UniformQuantizedPerAxisTypes. + SmallVector scales(axis_size, 1.0); + for (const auto op_type : op_types) { + if (const auto type = + dyn_cast(op_type)) { + for (const auto& index_scale : llvm::enumerate(type.getScales())) { + scales[index_scale.index()] *= index_scale.value(); + } + } else if (const auto type = + dyn_cast(op_type)) { + for (int index = 0; index < axis_size; ++index) { + scales[index] *= type.getScale(); + } + } + } + if (legacy_float_scale) { + for (int i = 0; i < scales.size(); ++i) { + scales[i] = static_cast(scales[i]); + } + } + + // Builds the result quantized type, which has signed 32 bits storage type. + Builder builder(expressed_type.getContext()); + const IntegerType storage_type = builder.getIntegerType(32); + const int64_t storage_type_min = + quant::QuantizedType::getDefaultMinimumForInteger(/*isSigned=*/true, 32); + const int64_t storage_type_max = + quant::QuantizedType::getDefaultMaximumForInteger(/*isSigned=*/true, 32); + if (axis_size == 1) { + return quant::UniformQuantizedType::getChecked( + builder.getUnknownLoc(), + /*flags=*/true, storage_type, expressed_type, scales[0], + /*zeroPoint=*/0, storage_type_min, storage_type_max); + } else { + SmallVector zero_points(axis_size, 0); + // If the bias is a 1-D tensor, set the `quantizedDimension` to 0. + // If the bias rank is larger than 1 because it was already broadcasted + // to match the output shape, use the last index. 
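+    // (The std::max(adjusted_quant_dim, 0) below also tolerates callers that
+    // pass -1 for a 1-D bias, mapping it to dimension 0.)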
+ return quant::UniformQuantizedPerAxisType::getChecked( + builder.getUnknownLoc(), + /*flags=*/true, storage_type, expressed_type, scales, zero_points, + /*quantizedDimension=*/std::max(adjusted_quant_dim, 0), + storage_type_min, storage_type_max); + } +} + +ElementsAttr QuantizeLegacy(const Attribute real_value, + const Type tensor_type) { + if (!isa(real_value) || + !quant::QuantizedType::getQuantizedElementType(tensor_type)) { + return {}; + } + const auto real_values_attr = cast(real_value); + auto q_type = quant::QuantizedType::getQuantizedElementType(tensor_type); + std::vector real_values; + SmallVector quantized_attr; + real_values.reserve(real_values_attr.getNumElements()); + quantized_attr.reserve(real_values_attr.getNumElements()); + std::transform(real_values_attr.begin(), real_values_attr.end(), + std::back_inserter(real_values), [&](APFloat value) -> float { + return value.convertToFloat(); + }); + const ShapedType new_dense_type = dyn_cast_or_null( + q_type.castExpressedToStorageType(real_values_attr.getType())); + const int width = dyn_cast(q_type.getStorageType()).getWidth(); + + if (width == 8 && q_type.getStorageTypeMax() == 127 && + q_type.getStorageTypeMin() == -127) { + std::vector quantized_values(real_values_attr.getNumElements()); + if (auto uniform_type = dyn_cast(q_type)) { + float min, max, scale; + mlir::lite::toco_legacy::PortableSymmetricQuantizeFloats( + real_values.data(), real_values.size(), quantized_values.data(), &min, + &max, &scale); + // The scale has been adjusted, so the adjusted scale should be respected. + if (std::abs(scale - uniform_type.getScale()) > 1e-3) { + return Quantize(real_value, tensor_type); + } + } else if (auto uniform_type = + dyn_cast(q_type)) { + std::vector scales_inv; + std::vector dimension; + dimension.insert(dimension.end(), new_dense_type.getShape().begin(), + new_dense_type.getShape().end()); + std::transform(uniform_type.getScales().begin(), + uniform_type.getScales().end(), + std::back_inserter(scales_inv), + [](float scale) { return 1.0 / scale; }); + + tflite_migration::optimize::utils::SymmetricPerChannelQuantizeValues( + real_values.data(), scales_inv, dimension, + uniform_type.getQuantizedDimension(), &quantized_values); + } else { + return {}; + } + std::transform(quantized_values.begin(), quantized_values.end(), + std::back_inserter(quantized_attr), + [&](int8_t value) -> APInt { + return APInt(8, value, /*isSigned=*/true); + }); + return DenseElementsAttr::get(new_dense_type, quantized_attr); + } else if (width == 8) { + // This can be a state tensor, or an actual constant tensor with + // asymmetric range. For a state tensor, assigning correct quantization + // parameters is sufficient, and for constants with asymmetric range it's + // not correctly quantized by legacy quantizer so call the new Quantize. 
+ return Quantize(real_value, tensor_type); + } else if (width == 16) { + if (const auto uniform_type = dyn_cast(q_type)) { + const auto quantized_values = + tflite_migration::optimize::utils::SymmetricQuantizeFloatsToInt16( + real_values.data(), real_values.size(), uniform_type.getScale()); + std::transform(quantized_values.begin(), quantized_values.end(), + std::back_inserter(quantized_attr), + [&](int16_t value) -> APInt { + return APInt(16, value, /*isSigned=*/true); + }); + return DenseElementsAttr::get(new_dense_type, quantized_attr); + } + } else if (width == 32) { + std::vector scales; + if (const auto uniform_type = dyn_cast(q_type)) { + scales.push_back(uniform_type.getScale()); + } else if (const auto uniform_type = + dyn_cast(q_type)) { + scales.insert(scales.end(), uniform_type.getScales().begin(), + uniform_type.getScales().end()); + } else { + return {}; + } + const auto quantized_bias = + tflite_migration::optimize::utils::SymmetricBiasQuantize( + real_values.data(), real_values.size(), scales); + std::transform(quantized_bias.begin(), quantized_bias.end(), + std::back_inserter(quantized_attr), + [&](int32_t value) -> APInt { + return APInt(32, value, /*isSigned=*/true); + }); + return DenseElementsAttr::get(new_dense_type, quantized_attr); + } + return {}; +} + +ElementsAttr Quantize(const Attribute real_value, const Type tensor_type) { + if (const auto q_type = + quant::QuantizedType::getQuantizedElementType(tensor_type)) { + Type converted_type; + return dyn_cast_or_null( + mlir::quant::ir::quantizeAttr(real_value, q_type, converted_type)); + } + return {}; +} + +quant::QuantizedType DownCastScale(QuantizedType type, double min, double max, + Location loc) { + const SmallVector mins = {min}; + const SmallVector maxs = {max}; + return DownCastScale(type, mins, maxs, loc); +} + +quant::QuantizedType DownCastScale(QuantizedType type, + const SmallVectorImpl& mins, + const SmallVectorImpl& maxs, + Location loc) { + // The given type can be null. For example, there can be an invalid scale and + // so on. + if (!type) return type; + SmallVector scales(mins.size()); + SmallVector zero_points(mins.size()); + if (auto q_type = dyn_cast(type)) { + zero_points.push_back(q_type.getZeroPoint()); + } else if (auto q_type = dyn_cast(type)) { + zero_points = {q_type.getZeroPoints().begin(), + q_type.getZeroPoints().end()}; + } + for (int i = 0; i < mins.size(); ++i) { + scales[i] = (static_cast(maxs[i]) - static_cast(mins[i])) / + (type.getStorageTypeMax() - type.getStorageTypeMin()); + if (type.getStorageTypeMax() != -type.getStorageTypeMin()) { + // Only applies for asymmetric quantized range with original scale. 
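+      // For example, with min = -1.5 and max = 1.0 on an int8 [-128, 127]
+      // range, the downcast scale is 2.5 / 255 and the zero point becomes
+      // round(-128 - (-1.5 / scale)) = 25, clamped to the storage bounds.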
+ const float zero_point_from_min = + type.getStorageTypeMin() - mins[i] / scales[i]; + if (zero_point_from_min < type.getStorageTypeMin()) { + zero_points[i] = static_cast(type.getStorageTypeMin()); + } else if (zero_point_from_min > type.getStorageTypeMax()) { + zero_points[i] = static_cast(type.getStorageTypeMax()); + } else { + zero_points[i] = static_cast(std::round(zero_point_from_min)); + } + } + } + if (auto q_type = dyn_cast(type)) { + return UniformQuantizedType::get(q_type.getFlags(), q_type.getStorageType(), + q_type.getExpressedType(), scales[0], + zero_points[0], q_type.getStorageTypeMin(), + q_type.getStorageTypeMax()); + } else if (auto q_type = dyn_cast(type)) { + return quant::UniformQuantizedPerAxisType::get( + q_type.getFlags(), q_type.getStorageType(), q_type.getExpressedType(), + scales, zero_points, q_type.getQuantizedDimension(), + q_type.getStorageTypeMin(), q_type.getStorageTypeMax()); + } + return type; +} + +// A heuristic to determine whether the scales needs to be from operands or +// from results for the ops with the `SameOperandsAndResultsScale` property. +// The current implementation is based on the number of operands. +static bool PreferResultScale(Operation* op) { + int float_operands = 0; + for (auto operand : op->getOperands()) { + if (auto operand_type = dyn_cast(operand.getType())) { + if (isa(operand_type.getElementType())) { + if (++float_operands > 1) return true; + } + } + } + return false; +} + +std::unique_ptr GetDefaultQuantScaleSpec(Operation* op) { + auto spec = std::make_unique(); + if (isa(op)) { + spec->has_same_scale_requirement = true; + spec->required_same_scale_func = [op](const bool sign, + const int bit_width) { + return cast(op) + .RequiredSameOperandsAndResultsScale(sign, bit_width); + }; + spec->required_same_quantized_axes_func = [op]() { + return cast(op).RequiredSameQuantizedAxes(); + }; + } + if (isa(op)) { + spec->has_fixed_output_range = true; + spec->fixed_output_range_func = [op](bool sign, int bit_width) { + return cast(op).GetFixedOutputRange(sign, + bit_width); + }; + } + return spec; +} + +// The stats op of some of the ops can be redundant. The current implementation +// only considers the ops with restricted output params. +static bool IsStatsRedundant( + Operation* op, const OpQuantSpecGetter op_quant_spec_getter, + const OpQuantScaleSpecGetter op_quant_scale_spec_getter) { + // If it has FixedOutputRangeInterface, no need to manually create spec. + return isa(op) || + op_quant_scale_spec_getter(op)->has_fixed_output_range; +} + +static bool IsSameScaleOp( + Operation* op, const OpQuantScaleSpecGetter op_quant_scale_spec_getter) { + // If it has SameScalesOpInterface, no need to manually create spec. + return dyn_cast(op) || + op_quant_scale_spec_getter(op)->has_same_scale_requirement; +} + +bool RemoveRedundantStatsOps( + func::FuncOp func, const OpQuantSpecGetter op_quant_spec_getter, + const OpQuantScaleSpecGetter op_quant_scale_spec_getter) { + SmallVector all_stats_ops; + llvm::DenseSet redundant_stats_ops; + + // Step 0: remove the mlir::quant::ir::StatisticsOp which are used by the + // quant.qcast op in case it overrides the information from training FakeQuant + // ops. 
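+  // That is, a "stats -> quant.qcast" chain is rewired so the qcast consumes
+  // the stats op's input directly, and the now-unused stats op is erased.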
+ func.walk([&](mlir::quant::ir::QuantizeCastOp q) { + auto input_op = q.getArg().getDefiningOp(); + if (auto stats = + dyn_cast_or_null(input_op)) { + q.setOperand(stats.getArg()); + if (stats.use_empty()) stats.erase(); + } + }); + + // Step 1: forward pass: propagate any value scales which are not produces + // by `SameOperandsAndResultsScale`. Additionally, remove the value scales + // which are produced by the ops with the `FixedOutputRangeInterface`. + // Note that we don't propagate across the multiple-operands + // `SameOperandsAndResultsScale` ops like `concatenation`. + func.walk([&](mlir::quant::ir::StatisticsOp stats_op) { + all_stats_ops.push_back(stats_op); + }); + + while (!all_stats_ops.empty()) { + mlir::quant::ir::StatisticsOp stats_op = all_stats_ops.back(); + all_stats_ops.pop_back(); + + if (auto def = stats_op.getArg().getDefiningOp()) { + if (IsStatsRedundant(def, op_quant_spec_getter, + op_quant_scale_spec_getter)) { + redundant_stats_ops.insert(stats_op); + } + } + + for (Operation* user : stats_op.getResult().getUsers()) { + // We don't propagate this parameter down if it has multiple operands. + // We want to use the result parameter scales instead. + if (!IsSameScaleOp(user, op_quant_scale_spec_getter) || + PreferResultScale(user)) { + continue; + } + for (Value res : user->getResults()) { + if (!res.hasOneUse()) { + continue; + } + if (auto next_stats = dyn_cast( + *res.getUsers().begin())) { + // quantization parameters can be propagated to next_stats + redundant_stats_ops.insert(next_stats); + // add next_stats to the work list so propagation can continue. + all_stats_ops.push_back(next_stats); + } + } + } + } + + // Step 2: backward pass: For the ops skipped in the forward pass, propagate + // its results scale backwards as far as possible. + func.walk([&](mlir::quant::ir::StatisticsOp stats_op) { + if (redundant_stats_ops.find(stats_op) == redundant_stats_ops.end()) { + all_stats_ops.push_back(stats_op); + } + }); + + while (!all_stats_ops.empty()) { + mlir::quant::ir::StatisticsOp stats_op = all_stats_ops.back(); + all_stats_ops.pop_back(); + + if (Operation* def = stats_op.getArg().getDefiningOp()) { + if (!IsSameScaleOp(def, op_quant_scale_spec_getter)) { + continue; + } + for (Value input : def->getOperands()) { + if (auto next_stats = dyn_cast_or_null( + input.getDefiningOp())) { + redundant_stats_ops.insert(next_stats); + all_stats_ops.push_back(next_stats); + } + } + } + } + + // Step3: Remove all the redundant stats ops + for (Operation* it : redundant_stats_ops) { + if (!isa(it)) return true; + auto stats_op = cast(it); + stats_op.getResult().replaceAllUsesWith(stats_op.getArg()); + stats_op.erase(); + } + + // Returns false if the steps finish without errors. + return false; +} + +LogicalResult VerifySameScales(Operation* op) { + auto same_scale_op = cast(op); + + SmallVector collected_quant_params; + for (Value input : op->getOperands()) { + QuantizedType quant_params = + QuantizedType::getQuantizedElementType(input.getType()); + // Skip non-quantizable operands. + if (quant_params) { + collected_quant_params.push_back(quant_params); + } + } + + for (Value output : op->getResults()) { + const QuantizedType quant_params = + QuantizedType::getQuantizedElementType(output.getType()); + // Skip non-quantizable results. 
+ if (quant_params) { + collected_quant_params.push_back(quant_params); + } + } + + if (collected_quant_params.size() <= 1) return success(); + const auto& expected_params = collected_quant_params[0]; + for (int i = 1; i < collected_quant_params.size(); ++i) { + const auto& compared_params = collected_quant_params[i]; + // For some ops (such as Transpose or Squeeze), the quantized axis might not + // be the same, this function only verifies the scale and zero point in + // that case. The quantized axis should be verified in their own verifier + // method. + if (!same_scale_op.RequiredSameQuantizedAxes()) { + const auto expected_per_axis_qtype = + dyn_cast(expected_params); + const auto compared_per_axis_qtype = + dyn_cast(compared_params); + if (expected_per_axis_qtype && compared_per_axis_qtype && + llvm::equal(expected_per_axis_qtype.getScales(), + compared_per_axis_qtype.getScales()) && + llvm::equal(expected_per_axis_qtype.getZeroPoints(), + compared_per_axis_qtype.getZeroPoints()) && + expected_params.getStorageType() == + compared_params.getStorageType() && + expected_params.getExpressedType() == + compared_params.getExpressedType()) { + continue; + } + } + // Same quantization parameters are always ok. + if (expected_params == compared_params) continue; + // If the quantization parameters are not the same, as long as it has the + // same storage type and the op interface doesn't require same scale + // constraint for this storage type, it is still ok. + if (expected_params.isSigned() == compared_params.isSigned() && + expected_params.getStorageTypeIntegralWidth() == + compared_params.getStorageTypeIntegralWidth() && + !same_scale_op.RequiredSameOperandsAndResultsScale( + expected_params.isSigned(), + expected_params.getStorageTypeIntegralWidth())) + continue; + + std::string err_msg = + "quantization parameters violate the same scale constraint: "; + llvm::raw_string_ostream os(err_msg); + expected_params.print(os); + os << " vs. 
"; + compared_params.print(os); + os.flush(); + return op->emitOpError(err_msg); + } + return success(); +} + +quant::UniformQuantizedType GetFixedOutputRange( + const bool is_signed, const int bit_width, const Type tensor_type, + const double scale, int64_t zero_point, int64_t storage_min, + int64_t storage_max) { + const auto result_type = cast(tensor_type); + if (!isa(result_type.getElementType())) return {}; + Builder builder(result_type.getContext()); + + // Only support 8-bits and 16-bits + if (bit_width != 8 && bit_width != 16) return {}; + const IntegerType storage_type = builder.getIntegerType(bit_width); + if (!is_signed && bit_width == 8) { + zero_point += 128; + storage_min += 128; + storage_max += 128; + } + return quant::UniformQuantizedType::getChecked( + builder.getUnknownLoc(), is_signed, storage_type, + result_type.getElementType(), scale, zero_point, storage_min, + storage_max); +} + +quant::UniformQuantizedType GetFixedOutputRange(const bool is_signed, + const int bit_width, + const Type tensor_type, + const double scale, + const int64_t zero_point) { + return GetFixedOutputRange(is_signed, bit_width, tensor_type, scale, + zero_point, + /*storage_min=*/-(1 << (bit_width - 1)), + /*storage_max=*/(1 << (bit_width - 1)) - 1); +} + +Type ConvertSignedQuantizedToUnsigned(const Type signed_tensor_type, + const Location loc) { + const auto qtype = QType::getQuantizedElementType(signed_tensor_type); + if (!qtype || !qtype.isSigned()) return {}; + + const int num_bits = qtype.getStorageTypeIntegralWidth(); + // This is a negative value, and will be applied on zero points and fixed + // point ranges. + const int64_t offset = + QType::getDefaultMinimumForInteger(/*isSigned=*/true, num_bits) - + QType::getDefaultMinimumForInteger(/*isSigned=*/false, num_bits); + + const auto flags = !quant::QuantizationFlags::Signed; + QType new_qtype; + if (auto uqtype = dyn_cast(qtype)) { + new_qtype = quant::UniformQuantizedType::getChecked( + loc, flags, qtype.getStorageType(), qtype.getExpressedType(), + uqtype.getScale(), uqtype.getZeroPoint() - offset, + uqtype.getStorageTypeMin() - offset, + uqtype.getStorageTypeMax() - offset); + } else if (auto aqtype = + dyn_cast(qtype)) { + const auto zero_points = aqtype.getZeroPoints(); + SmallVector new_zero_points(zero_points.begin(), + zero_points.end()); + for (int i = 0; i < new_zero_points.size(); ++i) { + new_zero_points[i] -= offset; + } + new_qtype = quant::UniformQuantizedPerAxisType::getChecked( + loc, flags, qtype.getStorageType(), qtype.getExpressedType(), + aqtype.getScales(), new_zero_points, aqtype.getQuantizedDimension(), + aqtype.getStorageTypeMin() - offset, + aqtype.getStorageTypeMax() - offset); + } + return new_qtype.castFromExpressedType( + QType::castToExpressedType(signed_tensor_type)); +} + +LogicalResult RemoveDebugAttrPattern::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + // removeAttr will return nullptr if the attribute did not exist. Thus we can + // return success(result) to indicate if this op has changed. 
+ return success(/*isSuccess=*/ + op->removeAttr(kDebugModeOpQuantAttrName) || + op->removeAttr(kDebugModeOpFloatAttrName)); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h new file mode 100644 index 000000000000..39e805d6a1a8 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h @@ -0,0 +1,973 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TFLite transformation +// passes to work with op attributes. + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_QUANTIZATION_LIB_TF_QUANTIZATION_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_QUANTIZATION_LIB_TF_QUANTIZATION_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_traits.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace mlir { +namespace tf_quant { + +// A unit attribute can be attached to the quantize/dequantize ops which are +// added by the quantization 
passes. These ops can be removed erased without +// losing accuracy. +inline constexpr char kVolatileOpAttrName[] = "volatile"; + +// Following attributes are used to mark ops that are not quantizable during +// debug model generation process for whole-model verify mode. If these +// attributes are attached, the upstream float/quantized ops know which ops to +// connect to, and it also prevents these ops from being copied again. +inline constexpr char kDebugModeOpFloatAttrName[] = "debug_float"; +inline constexpr char kDebugModeOpQuantAttrName[] = "debug_quant"; + +// Used to annotate custom ops if they are quantizable. +inline constexpr char kQuantTraitAttrName[] = "_tfl_quant_trait"; +enum QuantizationTrait { FullyQuantizable = 0, NotQuantizable = 1 }; +inline constexpr absl::string_view QuantTraitValues[] = {"fully_quantizable", + "not_quantizable"}; +inline constexpr char kOutputQuantized[] = "_output_quantized"; + +inline constexpr double kNearZeroTolerance = 1.0e-6; + +using QuantParams = QuantizedType; +using QuantSpec = QuantizationSpecs; +using SignedInteger = std::pair; // bitwidth and sign +using QuantParamsForResults = llvm::SmallVector; +using AccumulatorScaleFunc = + std::function&, int, bool)>; +using BiasParamsMap = + absl::flat_hash_map, AccumulatorScaleFunc>>; +// UniformQuantizedType GetFixedOutputRange(bool sign, int bit_width) +using GetFixedOutputRangeFunc = std::function; +// bool RequiredSameOperandsAndResultsScale(bool sign, int $bit_width) +using RequiredSameOperandsAndResultsScaleFunc = std::function; +// bool RequiredSameQuantizedAxes() +using RequiredSameQuantizedAxesFunc = std::function; + +using CustomMap = CustomOpMap; + +// Quantization spec of an op, driving the quantization algorithm. +struct OpQuantSpec { + // Maps the operand index of a bias input to its quantization specifications, + // including the non-bias operand indexes and the method retrieving + // quantization parameters from list of parameters of the non-bias operands. + // This map is empty if the op doesn't have a bias operand. + BiasParamsMap biases_params; + + // Quantization parameters for value restricted outputs. This is the + // "hard-coded" parameters and should be used unconditionally for the + // quantized op. This vector is empty if the op doesn't have value restricted + // outputs. + llvm::DenseMap restricted_output_params; + + // Coefficient operand index and whether supporting per-channel quantization. + // For QAT, this information is carried by the FakeQuant*/Quantize/Dequantize + // ops, but post-training quantization, the quantization parameters need to be + // inferred from the tensor content and op property. A "-1" value indicates + // the operand doesn't support per-channel quantization. + llvm::DenseMap coeff_op_quant_dim; + + // Indices of quantizable operands. Biases are not included in this field, + // the indices of biases can be found in the `biases_params`. + absl::flat_hash_set quantizable_operands; +}; + +// A function signature for getting the particular OpQuantSpec for the provided +// op. +using OpQuantSpecGetter = + std::function(Operation*)>; + +// Quantization scale spec of an op. The information defined in the MLIR +// interfaces FixedOutputRangeInterface and SameOperandsAndResultsScale should +// be checked first if present. +// TODO: b/323478683: Consider deprecating this. +struct OpQuantScaleSpec { + // Whether this op has a fixed range requirement (e.g. 
sigmoid) + bool has_fixed_output_range = false; + // Whether this op should have same operand and result scales (e.g. concat) + bool has_same_scale_requirement = false; + // Whether this op should have same operand and result type (e.g. gather) + bool has_same_operand_and_result_type_requirement = false; + // Returns the fixed output range, when has_fixed_output_range is set. + GetFixedOutputRangeFunc fixed_output_range_func; + // Returns whether same operands and results scales are required. + RequiredSameOperandsAndResultsScaleFunc required_same_scale_func = + [](bool sign, int bit_width) { return true; }; + // Returns whether operands and results must have the same quantized axis. + RequiredSameQuantizedAxesFunc required_same_quantized_axes_func = []() { + return true; + }; +}; + +// A function signature for getting the particular OpQuantScaleSpec for the +// provided op. +using OpQuantScaleSpecGetter = + std::function(Operation*)>; + +// Used in TFL Numeric Verify +struct NumericVerifySpec { + // Whether to enable numeric verification + bool verify_numeric = false; + + // Tolerance level from the quantized value for verification. If the tolerance + // is very small(<0.1), only the stats of the diff is displayed. + float error_tolerance = 5.0f; + + // Whether to verify numerical correctness layer by layer or by whole model + bool whole_model_verify = false; + + // Whether to enable log for failures + bool log_if_failed_flag = false; +}; + +// Used in TFL Quantize Pass +struct QuantPassSpec { + // Variables to control TFL Numeric Verify + NumericVerifySpec numeric_verify_spec; + + // Variables related to quantization + QuantSpec quant_spec; +}; + +// Re-calculates scales again in float instead of simply downcasting existing +// scales. +quant::QuantizedType DownCastScale(quant::QuantizedType type, + const SmallVectorImpl& mins, + const SmallVectorImpl& maxs, + Location loc); + +quant::QuantizedType DownCastScale(quant::QuantizedType type, double min, + double max, Location loc); + +bool IsOpQuantizable(Operation* op); +bool QuantizableOpSupportsFloatOutputType(Operation* op); + +// Specialized version of location to string for flatbuffer exported locations. +inline std::string GetTensorNameFromLoc(Location loc) { + if (auto name_loc = llvm::dyn_cast(loc)) { + return name_loc.getName().str(); + } + return ""; +} + +template +struct ConvertStatsToQDQs + : public OpRewritePattern { + ConvertStatsToQDQs(int num_bits, bool narrow_range, bool is_signed, + bool legacy_float_scale, MLIRContext* context) + : OpRewritePattern(context), + num_bits(num_bits), + narrow_range(narrow_range), + is_signed(is_signed), + legacy_float_scale(legacy_float_scale) {} + + LogicalResult matchAndRewrite(mlir::quant::ir::StatisticsOp op, + PatternRewriter& rewriter) const override { + Type expressed = llvm::cast(op.getType()).getElementType(); + quant::QuantizedType quant_type; + SmallVector mins, maxs; + + if (op.getAxisStats().has_value()) { + // Per axis quantization (or per channel quantization) + int stats_num = op.getAxisStats()->getNumElements(); + if (stats_num == 0 || stats_num % 2 != 0) return failure(); + auto stats = llvm::dyn_cast(*op.getAxisStats()); + if (!stats) return failure(); + + for (auto it = stats.begin(), e = stats.end(); it != e; ++it) { + double rmin = FloatAttr::getValueAsDouble(*it++); + double rmax = FloatAttr::getValueAsDouble(*it); + // The default nudging implementation of mlir quant library might cause + // clamping during inference if the calibration range isn't wide enough. 
+ // So here we adjust the range to include 0.0. + rmin = std::min(rmin, 0.0); + rmax = std::max(rmax, 0.0); + if (num_bits == 16) { + // TODO: b/266536261 - Since the kernel implementation assumes that + // 16x8 integer quantization is symmetric, this MLIR quantizer + // supports only symmetric quantization. + rmax = std::max(std::abs(rmin), std::abs(rmax)); + rmin = -rmax; + } + TensorRangeSanityCheck(op, rmin, rmax); + mins.push_back(rmin); + maxs.push_back(rmax); + } + quant_type = quantfork::fakeQuantAttrsToType( + op.getLoc(), num_bits, *op.getAxis(), mins, maxs, narrow_range, + expressed, is_signed); + if (legacy_float_scale) { + quant_type = + mlir::tf_quant::DownCastScale(quant_type, mins, maxs, op->getLoc()); + } + } else if (auto stats = + llvm::dyn_cast(op.getLayerStats())) { + // Per tensor quantization + auto statValues = stats.getValues(); + double rmin = FloatAttr::getValueAsDouble(statValues[0]); + double rmax = FloatAttr::getValueAsDouble(statValues[1]); + // The default nudging implementation of mlir quant library might cause + // clamping during inference if the calibration range isn't wide enough. + // So here we adjust the range to include 0.0. + rmin = std::min(rmin, 0.0); + rmax = std::max(rmax, 0.0); + if (num_bits == 16) { + // TODO: b/266536261 - Since the kernel implementation assumes that + // 16x8 integer quantization is symmetric, this MLIR quantizer supports + // only symmetric quantization. + rmax = std::max(std::abs(rmin), std::abs(rmax)); + rmin = -rmax; + } + TensorRangeSanityCheck(op, rmin, rmax); + quant_type = + quantfork::fakeQuantAttrsToType(op.getLoc(), num_bits, rmin, rmax, + narrow_range, expressed, is_signed); + if (legacy_float_scale) { + quant_type = + mlir::tf_quant::DownCastScale(quant_type, rmin, rmax, op->getLoc()); + } + } else { + return failure(); + } + + rewriter.setInsertionPointAfter(op.getOperation()); + Type result_type = quant_type.castFromExpressedType(op.getType()); + auto q = + rewriter.create(op.getLoc(), result_type, op.getArg()); + q->setAttr(kVolatileOpAttrName, rewriter.getUnitAttr()); + + auto dq = rewriter.create(op.getLoc(), op.getType(), q); + op.getResult().replaceAllUsesWith(dq); + q.getOperation()->replaceUsesOfWith(dq, op.getArg()); + op.erase(); + + return success(); + } + + private: + int num_bits; + bool narrow_range; + bool is_signed; + bool legacy_float_scale; + + // Emits an op warning message if the calibrated range is larger than 10.0 and + // the storage type is less than or equal to 8 bits. + void TensorRangeSanityCheck(mlir::quant::ir::StatisticsOp op, double& min, + double& max) const { + double range = std::fabs(max - min); + if (num_bits <= 8 && range >= 10.0) { + op.emitWarning() + << "Tensor range is too wide to be quantized. Use tf.clip_by_value " + "or tf.relu6 to narrow the tensor range. Range: " + << range << ", bit width: " << num_bits; + } + if (std::abs(max - min) < kNearZeroTolerance) { + op.emitWarning() << "Tensor range (" << min << ", " << max + << ") is too narrow and it might cause overflow. 
" + "Expanding range symmetrically by " + << kNearZeroTolerance; + min -= kNearZeroTolerance; + max += kNearZeroTolerance; + } + } +}; + +template +bool UsedBy(Operation* op) { + for (Operation* user : op->getUsers()) { + if (llvm::isa_and_nonnull(user)) return true; + } + return false; +} + +template +void CreateVerifier(Operation* quantizing_op, Operation* quantized_op, + PatternRewriter& rewriter, int result_idx, + const QuantPassSpec& quant_params) { + rewriter.setInsertionPointAfter(quantized_op); + FloatAttr tolerance = rewriter.getF32FloatAttr( + quant_params.numeric_verify_spec.error_tolerance); + BoolAttr log = + rewriter.getBoolAttr(quant_params.numeric_verify_spec.log_if_failed_flag); + // Verify the quantized value by sending the result to the verifier. + rewriter.create( + quantizing_op->getLoc(), quantized_op->getResult(result_idx).getType(), + quantized_op->getResult(result_idx), quantizing_op->getResult(result_idx), + tolerance, log); +} + +template <> +inline bool UsedBy(Operation* op) { + return false; +} + +// This specialization is not going to be called, but needed for compilation. +template <> +inline void CreateVerifier(Operation* quantizing_op, + Operation* quantized_op, + PatternRewriter& rewriter, int result_idx, + const QuantPassSpec& quant_params) {} + +// A base rewrite pattern which matches any N-in-M-out operations with +// quantization parameters propagated to at least one of its operands. The +// quantization parameters are annotated by the QuantizeOp/DequantizeOp pairs. +// Each matched pattern are rewritten by its quantized alternatives. +// +// The concrete pattern, extends from this base pattern, can specify whether it +// allows dynamic range quantized operands and results for the operations in the +// current context. These "DynamicRangeQuantized" operands and results don't +// have quantization parameters propagated to, so will be in float in the +// quantized results. The concrete pattern should define the following two +// functions: +// +// bool AllowDynamicRangeQuantizedOperand(Operation *) const +// bool AllowDynamicRangeQuantizedResult(Operation *) const +// +// Full integer quantization disallows "DynamicRangeQuantized" operands or +// results. Dynamic range quantization allows "DynamicRangeQuantized" operands +// and results. +template +class QuantizationPattern : public RewritePattern { + public: + using BaseType = QuantizationPattern; + + explicit QuantizationPattern(MLIRContext* context, + const QuantPassSpec& quant_params) + // Set the score to a large number so it is always preferred. + : RewritePattern(RootOpT::getOperationName(), 300, context), + quant_params_(quant_params) {} + + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { + llvm::SmallVector quantizing_ops; + + // Collect all the ops to quantize, as the user / producer of the root op. + if constexpr (std::is_same_v) { + if (op->getNumResults() != 1) { + return failure(); + } + auto users = op->getResult(0).getUsers(); + quantizing_ops.append(users.begin(), users.end()); + } else if constexpr (std::is_same_v) { + if (op->getNumOperands() != 1) { + return failure(); + } + Value quantize_operand = op->getOperand(0); + if (QuantizedType::getQuantizedElementType(quantize_operand.getType())) { + // The input of this QuantizeOp has already been quantized, i.e. + // rescale. + return failure(); + } + DenseFPElementsAttr attr; + if (matchPattern(quantize_operand, m_Constant(&attr))) { + // Const-> QuantizeOp pattern will be handled separately. 
+ return failure(); + } + if (Operation* quantizing_op = quantize_operand.getDefiningOp()) { + quantizing_ops.push_back(quantizing_op); + } + } + + tensorflow::DataType inference_type = + quant_params_.quant_spec.inference_type; + bool weight_only_quantization = + quant_params_.quant_spec.weight_only_quantization; + bool enable_verify = quant_params_.numeric_verify_spec.verify_numeric; + bool enable_whole_model_verify = + quant_params_.numeric_verify_spec.whole_model_verify; + absl::flat_hash_set ops_blocklist = + quant_params_.quant_spec.ops_blocklist; + absl::flat_hash_set nodes_blocklist = + quant_params_.quant_spec.nodes_blocklist; + CustomMap custom_map = quant_params_.quant_spec.custom_map; + + // Rewrite the floating-point ops to the quantized version, by fusing + // preceding dequantize ops and succeding quantize ops. + for (Operation* quantizing_op : quantizing_ops) { + // If it is requantize op, we shouldn't rewrite this op. + if (llvm::isa(quantizing_op)) { + return failure(); + } + + // If the op is terminator, not quantizable or any ops from the mlir quant + // ops dialect, we shouldn't rewrite. In case of whole-model verify debug + // mode, not-quantizable ops should be duplicated to keep parallel + // float/quant model execution. + if (quantizing_op->hasTrait()) { + return failure(); + } + + if (!IsOpQuantizable(quantizing_op) && + !static_cast(this)->IsQuantizableCustomOp( + quantizing_op, custom_map)) { + if (!(enable_verify && enable_whole_model_verify)) { + return failure(); + } + if (quantizing_op->hasAttr(kDebugModeOpQuantAttrName) || + quantizing_op->hasAttr(kDebugModeOpFloatAttrName)) { + return failure(); + } + + rewriter.setInsertionPoint(quantizing_op); + Operation* float_op = rewriter.clone(*quantizing_op); + quantizing_op->setAttr(kDebugModeOpQuantAttrName, + rewriter.getUnitAttr()); + float_op->setAttr(kDebugModeOpFloatAttrName, rewriter.getUnitAttr()); + RewireFloatModelBackbone(quantizing_op, float_op); + return success(); + } + + // Blocklist op is checked in advance for non-dynamic range quantization + // case. + if (!quant_params_.quant_spec.weight_quantization && + (ops_blocklist.find(quantizing_op->getName().getStringRef().str()) != + ops_blocklist.end())) { + return failure(); + } + + if (!nodes_blocklist.empty()) { + if (auto name_loc = llvm::dyn_cast(quantizing_op->getLoc())) { + std::string sloc = name_loc.getName().str(); + if (!sloc.empty() && + (nodes_blocklist.find(sloc) != nodes_blocklist.end())) { + return failure(); + } + } + } + + // An op with float inputs and outputs are expected when it's used by a + // NumericVerify op. Skip this op. + if (enable_verify && UsedBy(quantizing_op)) { + continue; + } + + bool is_operand_or_result_modified = false; + // Collect all the quantized inputs and "clone" the matched op by these + // inputs. + SmallVector inputs; + inputs.reserve(quantizing_op->getNumOperands()); + for (auto operand : quantizing_op->getOperands()) { + Type operand_type = operand.getType(); + if (isa(operand_type)) { + inputs.push_back(operand); + continue; + } + + auto ele_type = + llvm::cast(operand.getType()).getElementType(); + if (static_cast(this) + ->AllowDynamicRangeQuantizedOperand(quantizing_op, + custom_map)) { + auto dq_op = dyn_cast_or_null(operand.getDefiningOp()); + + if (dq_op && inference_type == tensorflow::DT_QINT8 && + !static_cast(this)->IsWeightOnlyOp( + quantizing_op, ops_blocklist, weight_only_quantization, + custom_map)) { + // Dynamic range quantization is applied by having QuantizeOp as an + // input. 
Only int8 weight is supported for now. + inputs.push_back(dq_op.getOperand()); + is_operand_or_result_modified = true; + } else { + // Otherwise, it's the case where the operand is activations or the + // quantizing_op is non-supported/weight-only. + inputs.push_back(operand); + } + } else { + if (auto dq_op = + dyn_cast_or_null(operand.getDefiningOp())) { + is_operand_or_result_modified = true; + inputs.push_back(dq_op.getOperand()); + } else if (!ele_type.isF32()) { + // If the operand is an integer tensor, then it doesn't require the + // DequantizeOp in the pattern. + inputs.push_back(operand); + } else { + return failure(); + } + } + } + + Operation* quantized_op; + if (QuantizableOpSupportsFloatOutputType(quantizing_op)) { + rewriter.setInsertionPointAfter(quantizing_op); + OperationState new_state( + quantizing_op->getLoc(), quantizing_op->getName().getStringRef(), + inputs, quantizing_op->getResultTypes(), quantizing_op->getAttrs()); + for (const auto& indexed_regions : + llvm::enumerate(quantizing_op->getRegions())) { + Region* target_region = new_state.addRegion(); + IRMapping mapping; + indexed_regions.value().cloneInto(target_region, mapping); + } + quantized_op = rewriter.create(new_state); + rewriter.replaceOp(quantizing_op, quantized_op); + } else { + // Collect all the quantized outputs and replace them by the results of + // the new quantized op. + llvm::SmallDenseMap outputs_replaced; + SmallVector output_types; + output_types.reserve(quantizing_op->getNumResults()); + for (const auto& enumerated_result : + llvm::enumerate(quantizing_op->getResults())) { + Value result = enumerated_result.value(); + Type result_type = result.getType(); + // Add this to the test coverage once we create test ops with none + // type results. + if (isa(result_type)) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result_type); + continue; + } + Type result_ele_type = + llvm::cast(result.getType()).getElementType(); + // If the user is the QuantizeOp, it must be the only user. + if (result.hasOneUse() && + llvm::isa(*result.user_begin())) { + auto user = llvm::cast(*result.user_begin()); + outputs_replaced.insert( + {user.getResult(), enumerated_result.index()}); + output_types.push_back(user.getType()); + is_operand_or_result_modified = true; + } else if (!result_ele_type.isF32()) { + // If the result is an integer tensor, then it doesn't require the + // D op in the pattern. + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result.getType()); + } else if (static_cast(this) + ->AllowDynamicRangeQuantizedResult(quantizing_op, + custom_map)) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result.getType()); + } else { + return failure(); + } + } + + // For float16 quantization if none of the operand or result is + // modified, replacing the op. See b/335025403. 
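+      // (In other words, leave the op untouched when no dequantize/quantize
+      // op was actually fused into its operands or results.)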
+ if (inference_type == tensorflow::DT_HALF && + !is_operand_or_result_modified) { + return failure(); + } + + rewriter.setInsertionPointAfter(quantizing_op); + OperationState new_state( + quantizing_op->getLoc(), quantizing_op->getName().getStringRef(), + inputs, output_types, quantizing_op->getAttrs()); + for (int i = 0; i < quantizing_op->getNumRegions(); ++i) { + new_state.addRegion(); + } + quantized_op = rewriter.create(new_state); + if (quantizing_op->getNumRegions() != 0) { + for (const auto& indexed_regions : + llvm::enumerate(quantizing_op->getRegions())) { + Region& target_region = + quantized_op->getRegion(indexed_regions.index()); + IRMapping mapping; + indexed_regions.value().cloneInto(&target_region, mapping); + } + } + for (auto output : outputs_replaced) { + output.getFirst().replaceAllUsesWith( + quantized_op->getResult(output.getSecond())); + } + } + + // To verify the numericals, the original floating-point ops are + // preserved in the graph. The result of these floating-point ops are sent + // to a numeric verifier op as the reference. + if (enable_verify && !std::is_same_v) { + // For constant operands, the floating-point constant is duplicated in + // case it is quantized. + for (int i = 0, e = quantized_op->getNumOperands(); i < e; ++i) { + auto def = quantized_op->getOperand(i).getDefiningOp(); + if (auto q = llvm::dyn_cast_or_null(def)) { + DenseFPElementsAttr attr; + if (!matchPattern(q.getOperand(), m_Constant(&attr))) { + continue; + } + auto cst = rewriter.create( + quantized_op->getLoc(), attr); + quantizing_op->setOperand(i, cst.getResult()); + } + } + + for (int i = 0, e = quantized_op->getNumResults(); i < e; ++i) { + if (!isa( + cast(quantizing_op->getResult(i).getType()) + .getElementType())) { + continue; + } + CreateVerifier(quantizing_op, quantized_op, rewriter, i, + quant_params_); + + if (enable_whole_model_verify) { + RewireFloatModelBackbone(quantized_op, quantizing_op); + } + } + } + } + return success(); + } + + private: + // Reconnects float ops in the whole-model verify mode. Works for both + // Quantizable ops and Unquantizable ops + void RewireFloatModelBackbone(Operation* quantized_op, + Operation* float_op) const { + for (int i = 0, e = quantized_op->getNumResults(); i < e; ++i) { + if (!llvm::cast(float_op->getResult(i).getType()) + .getElementType() + .isF32()) { + continue; + } + // Find the Quantize/Dequantize users of the new op results, and replace + // the usage. Then all the floating-point ops are connected, forming a + // separate float "backbone" model that the quantized model can be + // compared against in parallel. + // N.B. the return op will use this floating-point result. + Value result; + if (!IsOpQuantizable(float_op)) { + // For not quantizable ops, search for dequantize attached to the + // quantized op of the output. + if (Operation* quantize_op = dyn_cast_or_null( + *quantized_op->getResult(i).getUsers().begin())) { + result = quantize_op->getResult(0); + } else { + quantized_op->emitError() + << "Output[" << i + << "] is expected to have only one user [QUANTIZE]"; + return; + } + } else { + result = quantized_op->getResult(i); + } + for (auto user : result.getUsers()) { + // Skip the Requantize op and set the user to the following dequantize + // op. This happens when the quantizer tries to match the scale conflict + // with QuantizeOp - QuantizeOp(requant) - DequantizeOp triples. The + // correct float op should be the user of the last DequantizeOp. 
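+        // That is, for a "quantized op -> Quantize (requant) -> Dequantize"
+        // chain, step over the requantize result to reach its dequantize user.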
+ if (llvm::isa(user)) { + user = *user->getResult(0).getUsers().begin(); + } + if (auto dequantize = llvm::dyn_cast(user)) { + // Replace all uses, except not quantizable ops that are being used in + // the float backbone. + dequantize.getResult().replaceUsesWithIf( + float_op->getResult(i), [&](OpOperand& use) { + return !use.getOwner()->hasAttr(kDebugModeOpQuantAttrName); + }); + } + } + } + } + + QuantPassSpec quant_params_; +}; + +// A pattern that removes debug attributes that are annotated to ops during +// the debug model creation. +class RemoveDebugAttrPattern : public RewritePattern { + public: + explicit RemoveDebugAttrPattern(MLIRContext* context) + : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {} + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override; +}; + +// Converts quantized tensor type with signed integer type to quantized tensor +// type with unsigned integer type. +Type ConvertSignedQuantizedToUnsigned(Type signed_tensor_type, Location loc); + +// Converts quantize ops with unsigned quantized types to these with signed +// quantized types and preserves the scales. +template +struct ConvertUnsignedToSigned : public OpRewritePattern { + using BaseType = ConvertUnsignedToSigned; + using QType = quant::QuantizedType; + + explicit ConvertUnsignedToSigned(MLIRContext* context) + : OpRewritePattern(context, 1) {} + + LogicalResult matchAndRewrite(QuantizeOpT op, + PatternRewriter& rewriter) const override { + Type output_type = op.getResult().getType(); + auto qtype = QType::getQuantizedElementType(output_type); + if (!qtype || qtype.isSigned()) return failure(); + + int num_bits = qtype.getStorageTypeIntegralWidth(); + if (num_bits == 8) { + // If storage is 8-bit, trained num bits may be less than 8 so check here. + num_bits = + static_cast(std::ceil(std::log2(qtype.getStorageTypeMax()))); + } + // This is a positive value, and will be applied on zero points and fixed + // point ranges. + int64_t offset = + QType::getDefaultMinimumForInteger(/*isSigned=*/false, num_bits) - + QType::getDefaultMinimumForInteger(/*isSigned=*/true, num_bits); + + auto flags = quant::QuantizationFlags::Signed; + QType new_qtype; + if (auto uqtype = llvm::dyn_cast(qtype)) { + new_qtype = quant::UniformQuantizedType::getChecked( + op.getLoc(), flags, qtype.getStorageType(), qtype.getExpressedType(), + uqtype.getScale(), uqtype.getZeroPoint() - offset, + uqtype.getStorageTypeMin() - offset, + uqtype.getStorageTypeMax() - offset); + } else if (auto aqtype = + llvm::dyn_cast(qtype)) { + auto zero_points = aqtype.getZeroPoints(); + llvm::SmallVector new_zero_points(zero_points.begin(), + zero_points.end()); + for (int i = 0, e = new_zero_points.size(); i < e; ++i) { + new_zero_points[i] -= offset; + } + new_qtype = quant::UniformQuantizedPerAxisType::getChecked( + op.getLoc(), flags, qtype.getStorageType(), qtype.getExpressedType(), + aqtype.getScales(), new_zero_points, aqtype.getQuantizedDimension(), + aqtype.getStorageTypeMin() - offset, + aqtype.getStorageTypeMax() - offset); + } else { + return failure(); + } + + if (!new_qtype) return failure(); + Type new_output_type = new_qtype.castFromExpressedType( + QType::castToExpressedType(output_type)); + rewriter.replaceOpWithNewOp(op, new_output_type, op.getArg()); + return success(); + } +}; + +// Fold Extra Requantize ops if the preceding ops has free scale requirement. 
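As an aside before the requantize-folding pattern below: the unsigned-to-signed rewrite above only shifts the zero points and storage bounds by a fixed offset while preserving the scales. A minimal standalone sketch of that arithmetic for 8-bit storage (illustrative only, not part of the patch):

#include <cstdint>
#include <iostream>

int main() {
  const int num_bits = 8;
  // getDefaultMinimumForInteger(/*isSigned=*/false, 8) is 0 and
  // getDefaultMinimumForInteger(/*isSigned=*/true, 8) is -128, so:
  const int64_t offset = 0 - (-(int64_t{1} << (num_bits - 1)));  // 128

  // Example asymmetric u8 parameters and their signed counterparts.
  const int64_t u8_zero_point = 131;
  std::cout << "offset=" << offset                          // 128
            << " i8_zero_point=" << u8_zero_point - offset  // 3
            << " i8_storage_min=" << 0 - offset             // -128
            << " i8_storage_max=" << 255 - offset << "\n";  // 127
  return 0;
}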
+template 
+struct FoldTrivalRequantizeOp : public OpRewritePattern {
+  explicit FoldTrivalRequantizeOp(MLIRContext* context)
+      : OpRewritePattern(context, 1) {}
+
+  LogicalResult matchAndRewrite(RequantizeOpT op,
+                                PatternRewriter& rewriter) const override {
+    Value pre_quantized = op->getOperand(0);
+    auto pre_quantized_type =
+        quant::QuantizedType::getQuantizedElementType(pre_quantized.getType());
+    if (!pre_quantized_type) return failure();
+
+    Operation* def = pre_quantized.getDefiningOp();
+    if (!def) return failure();
+    if (llvm::isa(def) ||
+        !def->hasTrait()) {
+      return failure();
+    }
+
+    // This op should not clobber def if there is more than one requantize of
+    // this value.
+    if (!pre_quantized.hasOneUse()) {
+      return failure();
+    }
+
+    op.emitWarning("Remove trivial `rescale` op. Please fix the source graph.");
+
+    llvm::SmallVector new_output_types;
+    for (auto result : def->getResults()) {
+      if (result.hasOneUse() && *result.getUsers().begin() == op) {
+        new_output_types.push_back(op.getResult().getType());
+      } else {
+        new_output_types.push_back(result.getType());
+      }
+    }
+
+    // Remove this rescale op.
+    rewriter.replaceOp(op, {pre_quantized});
+
+    // Replace the output scale of the preceding op.
+    rewriter.setInsertionPointAfter(def);
+    OperationState new_state(def->getLoc(), def->getName().getStringRef(),
+                             def->getOperands(), new_output_types,
+                             def->getAttrs());
+    Operation* new_op = rewriter.create(new_state);
+
+    rewriter.replaceOp(def, new_op->getResults());
+    return success();
+  }
+};
+
+// Given a quantized type `input`, magnifies its scales by the factor stored in
+// `factor`. If `input` isn't a quantized type, or the `factor` doesn't match
+// the dimension size of `input` or isn't floating-point, nullptr is returned.
+TypeAttr RescaleQuantizedType(Type input, Attribute factor);
+
+// Converts the min/max/num_bits/narrow_range information to a QuantizedType,
+// and then returns the attribute containing the QuantizedType. The `min` and
+// `max` arguments can be FloatAttr or DenseFPElementsAttr, in which case a
+// UniformQuantizedType or UniformQuantizedPerAxisType is returned
+// respectively. `narrow_range` is set to true for weights and `is_signed` is
+// set to true if signed integer symmetric quantization is used.
+//
+// Note that this method may broadcast min and max to match the dimension
+// length of `input_type`, if the `quant_dim` is valid. On the other hand, the
+// symmetry of min and max is not adjusted by this method. The QAT workflow
+// should set min/max correctly (and use `narrow_range`=true, `is_signed`=true)
+// if symmetric quantization is required.
+TypeAttr GetQuantizedTypeAttr(Builder builder, Type input_type, Attribute min,
+                              Attribute max, int quant_dim,
+                              IntegerAttr num_bits, BoolAttr narrow_range,
+                              bool is_signed, bool legacy_float_scale = false,
+                              bool use_fake_quant_num_bits = false);
+
+// Casts the `target` type to a quantized type by using the quantization
+// parameters from the type in the `source` type attribute.
+// Examples:
+//   f32 -> !quant.uniform
+//   tensor<4xf32> -> tensor<4x!quant.uniform>
+// The result is wrapped by a type attribute. Returns nullptr if the cast
+// isn't valid.
+//
+// `axis` specifies the quantization dimension in the `target` and is only
+// used if the element type of `source` is a per-channel quantized type.
+// During the casting, the quantization dimension of the result type needs to
+// be set to this new `axis` value.
+TypeAttr CastQuantizedTypeAttrFromExpressedType(Builder builder,
+                                                TypeAttr source, Type target,
+                                                int axis);
+
+// Quantizes the elements in the attribute `real_value` by the quantization
+// parameters in `tensor_type`. Returns an empty Attribute if the
+// `tensor_type` is not a QuantizedType or the quantization fails.
+ElementsAttr Quantize(Attribute real_value, Type tensor_type);
+
+// Quantizes the elements in "legacy mode", where it calls TOCO's methods to
+// quantize values with float scale.
+ElementsAttr QuantizeLegacy(Attribute real_value, Type tensor_type);
+
+// Returns the quantized type for an element attribute. The quantization
+// parameters in this type are based on the min and max elements of the
+// attribute. When the elements in the `attr` are not floating-point, or the
+// value range doesn't straddle zero, an empty type is returned. The min/max
+// are adjusted to be symmetric if the `symmetric` flag is set to true, and
+// `symmetric` can only be set to true when it is signed and narrow_range.
+Type GetUniformQuantizedTypeForWeight(ElementsAttr attr, bool symmetric,
+                                      unsigned num_bits, bool is_signed,
+                                      bool narrow_range,
+                                      bool legacy_float_scale = false,
+                                      bool use_fake_quant_num_bits = false);
+
+// Returns the per-channel quantized type for an element attribute.
+// `quant_dim` defines the quantization axis. The channel min/max are adjusted
+// to be symmetric if the `symmetric` flag is set to true, and `symmetric` can
+// only be set to true when it is signed and narrow_range.
+Type GetUniformQuantizedPerAxisTypeForWeight(
+    ElementsAttr attr, int quant_dim, bool symmetric, unsigned num_bits,
+    bool is_signed, bool narrow_range, bool legacy_float_scale = false,
+    bool use_fake_quant_num_bits = false);
+
+// Returns the quantized type of a bias input, given the quantized types of
+// other operands which are multiply-accumulated (the bias is added to the
+// accumulated value).
+quant::QuantizedType GetUniformQuantizedTypeForBias(
+    const std::vector& op_types, int adjusted_quant_dim,
+    bool legacy_float_scale = false);
+
+// Gets quantization scale specs (e.g. fixed output range, same result and
+// operand scales) from the default quantization interfaces. The op should
+// outlive the returned spec for its interface methods to be properly
+// referenced.
+std::unique_ptr GetDefaultQuantScaleSpec(Operation* op);
+
+// The function might contain more stats ops than required, and it will
+// introduce requantize ops if the calibration stats have conflicts. This
+// method tries to remove all the redundant stats ops.
+bool RemoveRedundantStatsOps(mlir::func::FuncOp func,
+                             OpQuantSpecGetter op_quant_spec_getter,
+                             OpQuantScaleSpecGetter op_quant_scale_spec_getter =
+                                 GetDefaultQuantScaleSpec);
+
+// Given quantization parameters for int8, computes the quantization parameters
+// for uint if it is required, and wraps the result in a UniformQuantizedType.
+quant::UniformQuantizedType GetFixedOutputRange(bool is_signed, int bit_width,
+                                                Type tensor_type, double scale,
+                                                int64_t zero_point,
+                                                int64_t storage_min,
+                                                int64_t storage_max);
+
+quant::UniformQuantizedType GetFixedOutputRange(bool is_signed, int bit_width,
+                                                Type tensor_type, double scale,
+                                                int64_t zero_point);
+
+// Extracts min and max values from the DenseFPElementsAttr and stores them
+// into `mins` and `maxs`. When mins and maxs are extracted per-channel,
+// `dim_size` is the number of channels and `slice_size` is the size of the
+// slice per channel. When `symmetric` is true, the range is expanded to
+// [-M, M].
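As an aside (not part of the header above): the weight, activation, and bias helpers declared here ultimately reduce to a small amount of fixed-point arithmetic. Below is a self-contained sketch of the conventional scheme, assuming symmetric narrow-range int8 weights, asymmetric int8 activations, and an int32 bias whose scale is the product of the operand scales; this is illustrative arithmetic, not a call into these APIs.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  // Symmetric, narrow-range int8 weights: zero point 0, storage range [-127, 127].
  const double w_min = -0.42, w_max = 0.87;
  const double w_scale = std::max(std::abs(w_min), std::abs(w_max)) / 127.0;

  // Asymmetric int8 activations over [-128, 127].
  const double a_min = 0.0, a_max = 6.0;
  const double a_scale = (a_max - a_min) / 255.0;
  // Choose the zero point so that a_min maps exactly to the storage minimum.
  const int64_t a_zero_point =
      static_cast<int64_t>(std::lround(-128.0 - a_min / a_scale));

  // The bias is added to accumulated activation*weight products, so its int32
  // scale is conventionally the product of the two operand scales, zero point 0.
  const double bias_scale = a_scale * w_scale;

  std::cout << "w_scale=" << w_scale << " a_scale=" << a_scale
            << " a_zero_point=" << a_zero_point
            << " bias_scale=" << bias_scale << "\n";
  return 0;
}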
+void ExtractMinMaxFromAttr(DenseFPElementsAttr values, int dim_size, + int slice_size, bool symmetric, + SmallVectorImpl& mins, + SmallVectorImpl& maxs); + +// Returns the quantized type for the +// input_type/min/max/storage_type_width/narrow_range. +Type GetQuantizedType(Builder builder, Type input_type, ArrayRef min, + ArrayRef max, int quant_dim, + int storage_type_width, bool narrow_range, bool is_signed, + bool legacy_float_scale = false, + bool use_fake_quant_num_bits = false); +} // namespace tf_quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_QUANTIZATION_LIB_TF_QUANTIZATION_UTILS_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/tf_test_base.h b/tensorflow/compiler/mlir/quantization/common/tf_test_base.h new file mode 100644 index 000000000000..3c171abf0ac7 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_test_base.h @@ -0,0 +1,86 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_TEST_BASE_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_TEST_BASE_H_ + +#include + +#include +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/func.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/core/platform/test.h" + +namespace mlir::tf_quant { + +using ::testing::Test; + +class QuantizationTestBase : public Test { + protected: + QuantizationTestBase() + : ctx_(quant::stablehlo::CreateMlirContextForQuantization()), + builder_(ctx_.get()) { + ctx_->loadDialect(); + } + + // Parses `module_op_str` to create a `ModuleOp`. + OwningOpRef ParseModuleOpString( + const absl::string_view module_op_str) { + return parseSourceString(module_op_str, ctx_.get()); + } + + // Convenience function that returns the first operation of type `OpT` from + // the `@main` function in `module_op`. Useful when testing with a text + // representation of a `ModuleOp` containing a single function `@main`. 
+ // Returns `failure` iff there is no `@main` or no such operation is found in + // `@main`. + template + FailureOr FindFirstOpFromMainFunc(ModuleOp module_op) { + func::FuncOp main_func_op = quant::FindMainFuncOp(module_op); + if (main_func_op == nullptr) return failure(); + + auto ops = main_func_op.getOps(); + if (ops.empty()) return failure(); + + return *ops.begin(); + } + + std::unique_ptr ctx_; + OpBuilder builder_; +}; + +} // namespace mlir::tf_quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_TEST_BASE_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/tf_uniform_quantized_types.cc b/tensorflow/compiler/mlir/quantization/common/tf_uniform_quantized_types.cc new file mode 100644 index 000000000000..da812387fc1b --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_uniform_quantized_types.cc @@ -0,0 +1,232 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/common/tf_uniform_quantized_types.h" + +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +#define DEBUG_TYPE "uniform-quantized-types" + +namespace mlir { +namespace tf_quant { + +using quant::QuantizedType; +using quant::UniformQuantizedPerAxisType; +using quant::UniformQuantizedType; + +UniformQuantizedType CreateI8F32UniformQuantizedType(const Location loc, + MLIRContext& context, + const double scale, + const int64_t zero_point, + const bool narrow_range) { + return UniformQuantizedType::getChecked( + loc, /*flags=*/quant::QuantizationFlags::Signed, + /*storageType=*/IntegerType::get(&context, /*width=*/8), + /*expressedType=*/Float32Type::get(&context), scale, zero_point, + /*storageTypeMin=*/llvm::minIntN(8) + (narrow_range ? 
1 : 0), + /*storageTypeMax=*/llvm::maxIntN(8)); +} + +UniformQuantizedType CreateI32F32UniformQuantizedType( + const Location loc, MLIRContext& context, const double scale, + const int64_t zero_point) { + return UniformQuantizedType::getChecked( + loc, /*flags=*/quant::QuantizationFlags::Signed, + /*storageType=*/IntegerType::get(&context, /*width=*/32), + /*expressedType=*/Float32Type::get(&context), scale, zero_point, + /*storageTypeMin=*/llvm::minIntN(32), + /*storageTypeMax=*/llvm::maxIntN(32)); +} + +UniformQuantizedPerAxisType CreateI8F32UniformQuantizedPerAxisType( + const Location loc, MLIRContext& context, const ArrayRef scales, + const ArrayRef zero_points, const int quantization_dimension, + const bool narrow_range) { + return UniformQuantizedPerAxisType::getChecked( + loc, /*flags=*/quant::QuantizationFlags::Signed, + /*storageType=*/IntegerType::get(&context, /*width=*/8), + /*expressedType=*/Float32Type::get(&context), SmallVector(scales), + SmallVector(zero_points), quantization_dimension, + /*storageTypeMin=*/llvm::minIntN(8) + (narrow_range ? 1 : 0), + /*storageTypeMax=*/llvm::maxIntN(8)); +} + +UniformQuantizedPerAxisType CreateI32F32UniformQuantizedPerAxisType( + const Location loc, MLIRContext& context, const ArrayRef scales, + const ArrayRef zero_points, const int quantization_dimension) { + return UniformQuantizedPerAxisType::getChecked( + loc, /*flags=*/quant::QuantizationFlags::Signed, + /*storageType=*/IntegerType::get(&context, /*width=*/32), + /*expressedType=*/Float32Type::get(&context), SmallVector(scales), + SmallVector(zero_points), quantization_dimension, + /*storageTypeMin=*/llvm::minIntN(32), + /*storageTypeMax=*/llvm::maxIntN(32)); +} + +bool IsStorageTypeI8(const QuantizedType quantized_type) { + const Type storage_type = quantized_type.getStorageType(); + return storage_type.isInteger(/*width=*/8); +} + +bool IsStorageTypeI32(const QuantizedType quantized_type) { + const Type storage_type = quantized_type.getStorageType(); + return storage_type.isInteger(/*width=*/32); +} + +bool IsExpressedTypeF32(const QuantizedType quantized_type) { + const Type expressed_type = quantized_type.getExpressedType(); + return mlir::isa(expressed_type); +} + +bool IsI8F32UniformQuantizedType(const Type type) { + const UniformQuantizedType quantized_type = + mlir::dyn_cast_or_null(type); + if (!quantized_type) { + LLVM_DEBUG(llvm::dbgs() + << "Expected a uniform quantized type. Got: " << type << ".\n"); + return false; + } + + if (!IsStorageTypeI8(quantized_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an i8 storage type. Got: " + << quantized_type << ".\n"); + return false; + } + + if (!IsExpressedTypeF32(quantized_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an f32 expressed type. Got: " + << quantized_type << ".\n"); + return false; + } + + return true; +} + +bool IsI8F32UniformQuantizedPerAxisType(const Type type) { + const UniformQuantizedPerAxisType quantized_per_axis_type = + mlir::dyn_cast_or_null(type); + if (!quantized_per_axis_type) { + LLVM_DEBUG(llvm::dbgs() + << "Expected a uniform quantized type. Got: " << type << ".\n"); + return false; + } + + if (!IsStorageTypeI8(quantized_per_axis_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an i8 storage type. Got: " + << quantized_per_axis_type << ".\n"); + return false; + } + + if (!IsExpressedTypeF32(quantized_per_axis_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an f32 expressed type. 
Got: " + << quantized_per_axis_type << ".\n"); + return false; + } + + return true; +} + +bool IsI32F32UniformQuantizedType(const Type type) { + const UniformQuantizedType quantized_type = + mlir::dyn_cast_or_null(type); + if (!quantized_type) { + LLVM_DEBUG(llvm::dbgs() + << "Expected a uniform quantized type. Got: " << type << ".\n"); + return false; + } + + if (!IsStorageTypeI32(quantized_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an i32 storage type. Got: " + << quantized_type << ".\n"); + return false; + } + + if (!IsExpressedTypeF32(quantized_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an f32 expressed type. Got: " + << quantized_type << ".\n"); + return false; + } + + return true; +} + +bool IsI32F32UniformQuantizedPerAxisType(const Type type) { + const UniformQuantizedPerAxisType quantized_per_axis_type = + mlir::dyn_cast_or_null(type); + if (!quantized_per_axis_type) { + LLVM_DEBUG(llvm::dbgs() + << "Expected a uniform quantized type. Got: " << type << ".\n"); + return false; + } + + if (!IsStorageTypeI32(quantized_per_axis_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an i32 storage type. Got: " + << quantized_per_axis_type << ".\n"); + return false; + } + + if (!IsExpressedTypeF32(quantized_per_axis_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an f32 expressed type. Got: " + << quantized_per_axis_type << ".\n"); + return false; + } + + return true; +} + +// Determines whether the storage type of a quantized type is supported by +// `tfl.quantize` or `tfl.dequantize` ops. ui8, i8 and i16 are supported. +bool IsSupportedByTfliteQuantizeOrDequantizeOps(IntegerType storage_type) { + if (storage_type.getWidth() == 8 || + (storage_type.isSigned() && storage_type.getWidth() == 16)) { + return true; + } + LLVM_DEBUG(llvm::dbgs() + << "Uniform quantize / dequantize op only supports ui8, i8 or " + "i16 for the storage type of uniform quantized type. Got: " + << storage_type << ".\n"); + return false; +} + +bool IsQuantizedTensorType(Type type) { + if (!mlir::isa(type)) { + return false; + } + Type element_type = mlir::cast(type).getElementType(); + return mlir::isa(element_type); +} + +bool IsOpFullyQuantized(Operation* op) { + return llvm::all_of(op->getOperandTypes(), IsQuantizedTensorType) && + llvm::all_of(op->getResultTypes(), IsQuantizedTensorType); +} + +bool IsOpNotQuantized(Operation* op) { + return !llvm::any_of(op->getOperandTypes(), IsQuantizedTensorType) && + !llvm::any_of(op->getResultTypes(), IsQuantizedTensorType); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/common/tf_uniform_quantized_types.h b/tensorflow/compiler/mlir/quantization/common/tf_uniform_quantized_types.h new file mode 100644 index 000000000000..e0bec5c2630a --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/tf_uniform_quantized_types.h @@ -0,0 +1,116 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_UNIFORM_QUANTIZED_TYPES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_UNIFORM_QUANTIZED_TYPES_H_ + +#include + +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace tf_quant { + +// Creates a `UniformQuantizedType` with the given `scale` and `zero_point` +// values. The produced type has f32 as its expressed type and i8 as its +// storage type. The available values use the full range of the storage value, +// i.e. [-128, 127]. Assumes asymmetric quantization, meaning the zero point +// value can be a non-zero value. +// If `narrow_range` is set true (ex: for weights), a restricted range of +// integers will be used for symmetric mapping, i.e. [-127, 127]. +quant::UniformQuantizedType CreateI8F32UniformQuantizedType( + Location loc, MLIRContext& context, double scale, int64_t zero_point, + bool narrow_range = false); + +// Creates a `UniformQuantizedType` with the given `scale` and `zero_point` +// values. The produced type has f32 as its expressed type and i32 as its +// storage type. The available values use the full range of the storage value. +// Assumes asymmetric quantization, meaning the zero point value can be +// a non-zero value. +quant::UniformQuantizedType CreateI32F32UniformQuantizedType( + Location loc, MLIRContext& context, double scale, int64_t zero_point); + +// Creates a `UniformQuantizedPerAxisType` with the given `scales` and +// `zero_points` values. The produced type has f32 as its expressed type and +// i8 as its storage type. The available values use the full range of the +// storage value, i.e. [-128, 127]. Assumes asymmetric quantization, meaning the +// zero point values can be non-zero values. +// If `narrow_range` is set true (ex: for weights), a restricted range of +// integers will be used for symmetric mapping, i.e. [-127, 127]. +quant::UniformQuantizedPerAxisType CreateI8F32UniformQuantizedPerAxisType( + Location loc, MLIRContext& context, ArrayRef scales, + ArrayRef zero_points, int quantization_dimension, + bool narrow_range = false); + +// Creates a `UniformQuantizedPerAxisType` with the given `scales` and +// `zero_points` values. The produced type has f32 as its expressed type and +// i32 as its storage type. The available values use the full range of the +// storage value. Assumes asymmetric quantization, meaning the +// zero point values can be non-zero values. +quant::UniformQuantizedPerAxisType CreateI32F32UniformQuantizedPerAxisType( + Location loc, MLIRContext& context, ArrayRef scales, + ArrayRef zero_points, int quantization_dimension); + +bool IsStorageTypeI8(quant::QuantizedType quantized_type); + +bool IsStorageTypeI32(quant::QuantizedType quantized_type); + +bool IsExpressedTypeF32(quant::QuantizedType quantized_type); + +// Given a value, extract the `ElementType`. +// `value` should be a non-null `TensorType`. 
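As an aside (not part of the header): a hypothetical usage sketch of the factory and predicate helpers declared above. It assumes the context returned by CreateMlirContextForQuantization (the helper used by the test fixture added earlier in this patch) has the Quant dialect available; everything else follows the declarations in this header.

#include <cassert>
#include <memory>

#include "mlir/IR/Builders.h"
#include "mlir/IR/MLIRContext.h"
#include "tensorflow/compiler/mlir/quantization/common/tf_uniform_quantized_types.h"
#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h"

void ExampleUsage() {
  // Assumed: the returned context has the Quant dialect loaded, as in the
  // QuantizationTestBase fixture added in this change.
  std::unique_ptr<mlir::MLIRContext> ctx =
      mlir::quant::stablehlo::CreateMlirContextForQuantization();
  mlir::OpBuilder builder(ctx.get());

  // narrow_range=true restricts the i8 storage range to [-127, 127].
  auto qtype = mlir::tf_quant::CreateI8F32UniformQuantizedType(
      builder.getUnknownLoc(), *ctx, /*scale=*/0.05, /*zero_point=*/0,
      /*narrow_range=*/true);

  // The predicates above should then hold for the produced type.
  assert(mlir::tf_quant::IsI8F32UniformQuantizedType(qtype));
  assert(qtype.getStorageTypeMin() == -127 && qtype.getStorageTypeMax() == 127);
  (void)qtype;
}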
+inline Type GetElementType(const Value value) { + return mlir::cast(value.getType()).getElementType(); +} + +// Returns true iff `type` is a uniform quantized type whose storage type is +// 8-bit integer and expressed type is f32. +bool IsI8F32UniformQuantizedType(Type type); + +// Returns true iff `type` is a uniform quantized per-axis (per-channel) type +// whose storage type is 8-bit integer and expressed type is f32. +bool IsI8F32UniformQuantizedPerAxisType(Type type); + +// Returns true iff `type` is a uniform quantized type whose storage type is +// 32-bit integer and expressed type is f32. +bool IsI32F32UniformQuantizedType(Type type); + +// Returns true iff `type` is a uniform quantized per-axis (per-channel) type +// whose storage type is 32-bit integer and expressed type is f32. +bool IsI32F32UniformQuantizedPerAxisType(Type type); + +// Determines whether the storage type of a quantized type is supported by +// `tfl.quantize` or `tfl.dequantize` ops. ui8, i8 and i16 are supported. +bool IsSupportedByTfliteQuantizeOrDequantizeOps(IntegerType storage_type); + +// Returns true if a type is quantized tensor type. +bool IsQuantizedTensorType(Type type); + +// Returns true if all operands and results are quantized. +bool IsOpFullyQuantized(Operation* op); + +// Returns true iff none among operand and result tensors are quantized. +bool IsOpNotQuantized(Operation* op); + +} // namespace tf_quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TF_UNIFORM_QUANTIZED_TYPES_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD index ec79c4f83f5d..7946079794f0 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD @@ -27,16 +27,24 @@ package( ) gentbl_cc_library( - name = "stablehlo_passes_inc_gen", + name = "tf_stablehlo_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - ], - "passes/passes.h.inc", - ), + tbl_outs = {"passes/tf_passes.h.inc": [ + "-gen-pass-decls", + ]}, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes/tf_passes.td", + deps = [ + "@llvm-project//mlir:PassBaseTdFiles", ], +) + +gentbl_cc_library( + name = "stablehlo_passes_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"passes/passes.h.inc": [ + "-gen-pass-decls", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/passes.td", deps = [ @@ -44,10 +52,103 @@ gentbl_cc_library( ], ) +cc_library( + name = "tf_passes", + srcs = [ + "passes/lift_quantizable_spots_as_functions_fusion.inc", + "passes/lift_quantizable_spots_as_functions_simple.inc", + "passes/remove_sharding_custom_call.inc", + "passes/tf_convert_func_to_bfloat16.cc", + "passes/tf_convert_shape_constraint_to_assert.cc", + "passes/tf_convert_xla_call_module_op_to_bfloat16.cc", + "passes/tf_defer_activation_transpose.cc", + "passes/tf_fold_constant_transpose.cc", + "passes/tf_insert_calibration_statistics_saver.cc", + "passes/tf_insert_weight_param.cc", + "passes/tf_lift_quantizable_spots_as_functions.cc", + "passes/tf_merge_fusion_with_dequantize.cc", + "passes/tf_nchw_convolution_to_nhwc.cc", + "passes/tf_optimize_graph.cc", + "passes/tf_post_quantize.cc", + "passes/tf_prepare_quantize.cc", + "passes/tf_quantize.cc", + "passes/tf_quantize_composite_functions.cc", + "passes/tf_quantize_weight.cc", + "passes/tf_remove_sharding_custom_call.cc", + 
"passes/tf_replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc", + "passes/tf_restore_function_name.cc", + "passes/tf_unfuse_mhlo_batch_norm.cc", + "passes/tf_unwrap_xla_call_module_op.cc", + "passes/tf_xla_call_module_to_call.cc", + ], + hdrs = [ + "passes/tf_passes.h", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + ":bfloat16_type", + ":fill_quantization_options", + ":lift_quantizable_spots_as_functions_fusion_inc_gen", + ":lift_quantizable_spots_as_functions_simple_inc_gen", + ":optimize_graph_inc_gen", + ":quantization_config_proto_cc", + ":quantization_options_proto_cc", + ":remove_sharding_custom_call_inc_gen", + ":stablehlo_type_utils", + ":tf_quantization_patterns", + ":tf_stablehlo_passes_inc_gen", + "//tensorflow/compiler/mlir/quantization/common:func", + "//tensorflow/compiler/mlir/quantization/common:tf_attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:tf_lift_as_function_call", + "//tensorflow/compiler/mlir/quantization/common:uniform_quantized_types", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib", + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib:tf_quantization_config", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:permutation", + "//tensorflow/compiler/mlir/quantization/stablehlo/ops:tf_stablehlo_op_quant_spec", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quant_ops", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs", + "//tensorflow/core:portable_gif_internal", + "//tensorflow/core/ir/types:Dialect", + "@com_google_absl//absl/base:nullability", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@eigen_archive//:eigen3", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Rewrite", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:TransformUtils", + "@local_tsl//tsl/platform:path", + "@local_tsl//tsl/platform:protobuf", + "@local_tsl//tsl/platform:regexp", + "@local_xla//xla/mlir_hlo", + "@local_xla//xla/mlir_hlo:mhlo_passes", + "@stablehlo//:chlo_ops", + "@stablehlo//:stablehlo_ops", + "@stablehlo//:stablehlo_passes", + "@stablehlo//:stablehlo_portable_api", + "@stablehlo//:stablehlo_serialization", + "@stablehlo//:version", + ], +) + cc_library( name = "passes", srcs = [ "passes/convert_func_to_bfloat16.cc", + "passes/convert_shape_constraint_to_assert.cc", "passes/convert_xla_call_module_op_to_bfloat16.cc", "passes/defer_activation_transpose.cc", "passes/fold_constant_transpose.cc", @@ -138,6 +239,7 @@ cc_library( "@llvm-project//mlir:Rewrite", "@llvm-project//mlir:ShapeDialect", "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", "@local_tsl//tsl/platform:path", @@ -150,11 +252,42 @@ cc_library( "@local_xla//xla/tsl/protobuf:protos_all_cc", "@stablehlo//:chlo_ops", "@stablehlo//:stablehlo_ops", + 
"@stablehlo//:stablehlo_passes", "@stablehlo//:stablehlo_portable_api", "@stablehlo//:stablehlo_serialization", ], ) +cc_library( + name = "tf_quantization_patterns", + srcs = ["passes/tf_quantization_patterns.cc"], + hdrs = [ + "passes/tf_quantization_patterns.h", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + ":quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/common:tf_attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:tf_lift_as_function_call", + "//tensorflow/compiler/mlir/quantization/common:uniform_quantized_types", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib", + "//tensorflow/compiler/mlir/quantization/stablehlo/ops:tf_stablehlo_op_quant_spec", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:path", + "@com_google_absl//absl/container:flat_hash_set", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + "@stablehlo//:stablehlo_ops", + ], +) + cc_library( name = "quantization_patterns", srcs = ["passes/quantization_patterns.cc"], @@ -209,12 +342,7 @@ td_library( gentbl_cc_library( name = "lift_quantizable_spots_as_functions_simple_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/lift_quantizable_spots_as_functions_simple.inc", - ), - ], + tbl_outs = {"passes/lift_quantizable_spots_as_functions_simple.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/lift_quantizable_spots_as_functions_simple.td", deps = [ @@ -226,12 +354,7 @@ gentbl_cc_library( gentbl_cc_library( name = "lift_quantizable_spots_as_functions_fusion_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/lift_quantizable_spots_as_functions_fusion.inc", - ), - ], + tbl_outs = {"passes/lift_quantizable_spots_as_functions_fusion.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/lift_quantizable_spots_as_functions_fusion.td", deps = [ @@ -243,12 +366,7 @@ gentbl_cc_library( gentbl_cc_library( name = "optimize_graph_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/optimize_graph.inc", - ), - ], + tbl_outs = {"passes/optimize_graph.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/optimize_graph.td", deps = [ @@ -260,12 +378,7 @@ gentbl_cc_library( gentbl_cc_library( name = "remove_sharding_custom_call_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/remove_sharding_custom_call.inc", - ), - ], + tbl_outs = {"passes/remove_sharding_custom_call.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/remove_sharding_custom_call.td", deps = [ @@ -276,15 +389,10 @@ gentbl_cc_library( gentbl_cc_library( name = "bridge_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Bridge", - ], - "passes/bridge/passes.h.inc", - ), - ], + tbl_outs = {"passes/bridge/passes.h.inc": [ + "-gen-pass-decls", + "-name=Bridge", + ]}, tblgen = 
"@llvm-project//mlir:mlir-tblgen", td_file = "passes/bridge/passes.td", deps = [ @@ -365,12 +473,7 @@ td_library( gentbl_cc_library( name = "optimize_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/bridge/optimize.inc", - ), - ], + tbl_outs = {"passes/bridge/optimize.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/bridge/optimize.td", deps = [":optimize_td_files"], @@ -493,17 +596,26 @@ cc_library( ) gentbl_cc_library( - name = "stablehlo_test_passes_inc_gen", + name = "tf_stablehlo_test_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Test", - ], - "passes/testing/passes.h.inc", - ), + tbl_outs = {"passes/testing/tf_passes.h.inc": [ + "-gen-pass-decls", + "-name=Test", + ]}, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes/testing/tf_passes.td", + deps = [ + "@llvm-project//mlir:PassBaseTdFiles", ], +) + +gentbl_cc_library( + name = "stablehlo_test_passes_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"passes/testing/passes.h.inc": [ + "-gen-pass-decls", + "-name=Test", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/testing/passes.td", deps = [ @@ -511,6 +623,48 @@ gentbl_cc_library( ], ) +cc_library( + name = "tf_test_passes", + srcs = [ + "passes/testing/tf_test_lift_quantizable_spots_as_functions_with_quantization_specs.cc", + "passes/testing/tf_test_post_calibration_component.cc", + "passes/testing/tf_test_pre_calibration_component.cc", + "passes/testing/tf_test_tf_to_stablehlo_pass.cc", + ], + hdrs = [ + "passes/testing/tf_passes.h", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + ":quantization_config_proto_cc", + ":tf_passes", + ":tf_stablehlo_test_passes_inc_gen", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:config", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:tf_post_calibration", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:tf_pre_calibration", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quantize_preprocess", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:SparseTensorDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:UBDialect", + "@local_tsl//tsl/platform:protobuf", + "@local_xla//xla/mlir_hlo", + "@stablehlo//:chlo_ops", + "@stablehlo//:stablehlo_ops", + "@stablehlo//:vhlo_ops", + ], +) + cc_library( name = "test_passes", srcs = [ @@ -768,8 +922,44 @@ tf_cc_binary( ":bridge_passes", ":passes", ":test_passes", + ":tf_passes", "//tensorflow/compiler/mlir:init_mlir", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:pass_pipeline", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", + 
"//tensorflow/core/ir/types:Dialect", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:FuncExtensions", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:TensorDialect", + "@local_xla//xla/mlir_hlo:hlo_dialect_registration", + "@local_xla//xla/mlir_hlo:mhlo_passes", + "@stablehlo//:stablehlo_ops", + "@stablehlo//:stablehlo_passes", + "@stablehlo//:vhlo_ops", + ], +) + +tf_cc_binary( + name = "tf-stablehlo-quant-opt", + srcs = ["tools/tf_stablehlo_quant_opt.cc"], + visibility = [":internal_visibility_allowlist_package"], + deps = [ + ":bridge_passes", + ":passes", + ":test_passes", + ":tf_passes", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:pass_pipeline", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD index 620c76e9c1f2..538d53c80cb5 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD @@ -143,6 +143,71 @@ tf_cc_test( ], ) +cc_library( + name = "tf_saved_model_export", + srcs = ["tf_saved_model_export.cc"], + hdrs = ["tf_saved_model_export.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":io", + ":tf_pass_pipeline", + ":tf_saved_model_import", + ":types", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:tf_passes", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:convert_asset_args", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:tf_unfreeze_constants", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass", + "//tensorflow/compiler/mlir/tf2xla/api/v2:tf_executor_to_graph", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@local_xla//xla/tsl/platform:errors", + "@local_xla//xla/tsl/platform:statusor", + ], +) + +tf_cc_test( + name = "tf_saved_model_export_test", + srcs = ["tf_saved_model_export_test.cc"], + deps = [ + ":tf_saved_model_export", + "//tensorflow/compiler/mlir/quantization/common:tf_test_base", + "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", + "//tensorflow/core:all_kernels", # 
buildcleaner: keep Required to export to GraphDef + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:ops", # buildcleaner: keep Required to export to GraphDef + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@local_tsl//tsl/platform:protobuf", + "@local_xla//xla/tsl/platform:status_matchers", + "@local_xla//xla/tsl/platform:statusor", + ], +) + cc_library( name = "saved_model_export", srcs = ["saved_model_export.cc"], @@ -208,6 +273,49 @@ tf_cc_test( ], ) +cc_library( + name = "tf_saved_model_import", + srcs = ["tf_saved_model_import.cc"], + hdrs = ["tf_saved_model_import.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":types", + "//tensorflow/cc/saved_model:loader", + "//tensorflow/cc/saved_model:reader", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quantize_preprocess", + "//tensorflow/compiler/mlir/tensorflow:mlir_import_options", + "//tensorflow/compiler/mlir/tensorflow:translate_lib", + "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@com_google_absl//absl/types:span", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@local_xla//xla/tsl/platform:errors", + "@local_xla//xla/tsl/platform:statusor", + ], +) + +tf_cc_test( + name = "tf_saved_model_import_test", + srcs = ["tf_saved_model_import_test.cc"], + deps = [ + ":tf_saved_model_import", + ":types", + "//tensorflow/compiler/mlir/quantization/common:tf_test_base", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "saved_model_import", srcs = ["saved_model_import.cc"], @@ -251,6 +359,27 @@ tf_cc_test( ], ) +cc_library( + name = "tf_pass_pipeline", + srcs = ["tf_pass_pipeline.cc"], + hdrs = ["tf_pass_pipeline.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo:bridge_passes", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo:tf_passes", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:tf_passes", + "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ReconcileUnrealizedCasts", + "@llvm-project//mlir:Transforms", + "@local_xla//xla/mlir_hlo:mhlo_passes", + "@stablehlo//:stablehlo_passes", + ], +) + cc_library( name = "pass_pipeline", srcs = ["pass_pipeline.cc"], @@ -272,6 +401,32 @@ cc_library( ], ) +cc_library( + name = "tf_pre_calibration", + srcs = ["tf_pre_calibration.cc"], + hdrs = ["tf_pre_calibration.h"], + compatible_with = get_compatible_with_portable(), + visibility = [ + "//tensorflow/compiler/mlir/quantization/stablehlo:__subpackages__", + 
"//tensorflow/compiler/mlir/quantization/tensorflow:__subpackages__", + ], + deps = [ + ":component", + ":tf_pass_pipeline", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:tf_passes", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", + "@com_google_absl//absl/base:nullability", + "@com_google_absl//absl/log:die_if_null", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@local_xla//xla/tsl/platform:errors", + ], +) + cc_library( name = "pre_calibration", srcs = ["pre_calibration.cc"], @@ -318,6 +473,27 @@ tf_cc_test( ], ) +cc_library( + name = "tf_report", + srcs = ["tf_report.cc"], + hdrs = ["tf_report.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":io", + "//tensorflow/compiler/mlir/quantization/common:tf_lift_as_function_call", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:protobuf", + ], +) + cc_library( name = "report", srcs = ["report.cc"], @@ -369,6 +545,30 @@ cc_library( ], ) +cc_library( + name = "tf_post_calibration", + srcs = ["tf_post_calibration.cc"], + hdrs = ["tf_post_calibration.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":component", + ":config", + ":tf_pass_pipeline", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/instrumentations:tf_save_report", + "//tensorflow/compiler/mlir/quantization/tensorflow:tf_passes", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", + "@com_google_absl//absl/base:nullability", + "@com_google_absl//absl/log:die_if_null", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@local_xla//xla/mlir_hlo:mhlo_passes", + "@local_xla//xla/tsl/platform:errors", + ], +) + cc_library( name = "post_calibration", srcs = ["post_calibration.cc"], @@ -442,6 +642,42 @@ cc_library( ], ) +cc_library( + name = "tf_weight_only_ptq", + srcs = ["tf_weight_only_ptq.cc"], + hdrs = ["tf_weight_only_ptq.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":component", + ":config", + ":context", + ":tf_pass_pipeline", + ":tf_saved_model_export", + ":tf_saved_model_import", + ":types", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/instrumentations:tf_save_report", + "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:tf_passes", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/base:nullability", + "@com_google_absl//absl/container:flat_hash_map", + 
"@com_google_absl//absl/log:die_if_null", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@local_xla//xla/mlir_hlo:mhlo_passes", + "@local_xla//xla/tsl/platform:errors", + "@local_xla//xla/tsl/platform:statusor", + ], +) + cc_library( name = "weight_only_ptq", srcs = ["weight_only_ptq.cc"], diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD index 1344a487471d..b7da9a0d52af 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD @@ -44,6 +44,45 @@ cc_library( ], ) +cc_library( + name = "tf_component", + srcs = ["tf_component.cc"], + hdrs = ["tf_component.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":representative_dataset", + ":statistics", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo:tf_passes", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:component", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:debugger", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:io", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:tf_saved_model_export", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:types", + "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", + "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:nullability", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:die_if_null", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@com_google_absl//absl/types:span", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@local_xla//xla/tsl/platform:errors", + "@local_xla//xla/tsl/platform:statusor", + ], +) + cc_library( name = "component", srcs = ["component.cc"], diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc index f18cf0f7df7f..a6e8fa86e9d1 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc @@ -104,8 +104,8 @@ absl::Status RunCalibrationPasses( } CalibrationComponent::CalibrationComponent( - absl::Nonnull ctx, - absl::Nonnull py_function_lib, + MLIRContext* absl_nonnull ctx, + const PyFunctionLibrary* absl_nonnull py_function_lib, const absl::string_view src_saved_model_path, absl::flat_hash_map function_aliases, std::unordered_set tags, diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h 
b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h index 03d2dd933732..d55f5afda362 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h @@ -57,9 +57,9 @@ class CalibrationComponent : public Component { // `representative_dataset_file_map` contains information about the // calibration dataset. CalibrationComponent( - absl::Nonnull ctx, - absl::Nonnull - py_function_lib, + MLIRContext* absl_nonnull ctx, + const tensorflow::quantization::PyFunctionLibrary* absl_nonnull + py_function_lib, absl::string_view src_saved_model_path, absl::flat_hash_map function_aliases, std::unordered_set tags, @@ -88,12 +88,12 @@ class CalibrationComponent : public Component { absl::StatusOr ImportCalibratedSavedModel( absl::string_view calibrated_saved_model_path); - absl::Nonnull ctx_; + MLIRContext* absl_nonnull ctx_; // Contains function implementations from the python layer. Should be injected // from the python level using pybind11. - absl::Nonnull - py_function_lib_; + const tensorflow::quantization::PyFunctionLibrary* absl_nonnull + py_function_lib_; // Path to the pre-calibrated SavedModel. std::string src_saved_model_path_; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/tf_component.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/tf_component.cc new file mode 100644 index 000000000000..874b012b5d68 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/tf_component.cc @@ -0,0 +1,214 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/tf_component.h" + +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/base/nullability.h" +#include "absl/container/flat_hash_map.h" +#include "absl/log/die_if_null.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_export.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "xla/tsl/platform/errors.h" +#include "xla/tsl/platform/statusor.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace mlir::tf_quant::stablehlo { +namespace { + +using ::stablehlo::quantization::AddCalibrationStatistics; +using ::stablehlo::quantization::CreateRepresentativeDatasetFileMap; +using ::stablehlo::quantization::DisableDebugging; +using ::stablehlo::quantization::IsCalibrationRequired; +using ::stablehlo::quantization::QuantizationConfig; +using ::stablehlo::quantization::ReadStatistics; +using ::stablehlo::quantization::RepresentativeDatasetConfig; +using ::stablehlo::quantization::io::CreateTmpDir; +using ::stablehlo::quantization::io::GetLocalTmpFileName; +using ::stablehlo::quantization::io::ListDirectory; +using ::tensorflow::AssetFileDef; +using ::tensorflow::SignatureDef; +using ::tensorflow::calibrator::CalibrationStatistics; +using ::tensorflow::quantization::ExportedModel; +using ::tensorflow::quantization::PyFunctionLibrary; +using ::tensorflow::quantization::RunPasses; +using CalibrationStatisticsFlatMap = + absl::flat_hash_map; + +} // namespace + +absl::Status RunCalibrationPasses( + mlir::ModuleOp module_op, MLIRContext& ctx, + absl::string_view calibration_data_dir, + const bool force_regenerate_calibration_data) { + // Disable DumpTensor ops when running calibration. 
+  DisableDebugging(module_op);
+
+  std::vector<std::string> skipping_aggregator_ops;
+  if (!force_regenerate_calibration_data) {
+    TF_ASSIGN_OR_RETURN(const CalibrationStatisticsFlatMap statistics_map,
+                        ReadStatistics(calibration_data_dir));
+    absl::c_for_each(statistics_map, [&](const auto& iter) {
+      return skipping_aggregator_ops.push_back(iter.first);
+    });
+  }
+
+  return RunPasses(
+      /*name=*/
+      CalibrationComponent::kName,
+      /*add_passes_func=*/
+      [calibration_data_dir, &skipping_aggregator_ops](PassManager& pm) {
+        pm.addPass(CreateInsertCalibrationStatisticsSaverPass(
+            calibration_data_dir, skipping_aggregator_ops));
+      },
+      ctx, module_op);
+}
+
+CalibrationComponent::CalibrationComponent(
+    MLIRContext* absl_nonnull ctx,
+    const PyFunctionLibrary* absl_nonnull py_function_lib,
+    const absl::string_view src_saved_model_path,
+    absl::flat_hash_map<FunctionName, FunctionAlias> function_aliases,
+    std::unordered_set<std::string> tags,
+    absl::flat_hash_map<std::string, SignatureDef> signature_def_map,
+    std::vector<std::string> signature_keys)
+    : ctx_(ABSL_DIE_IF_NULL(ctx)),                          // Crash OK
+      py_function_lib_(ABSL_DIE_IF_NULL(py_function_lib)),  // Crash OK
+      src_saved_model_path_(src_saved_model_path),
+      function_aliases_(std::move(function_aliases)),
+      tags_(std::move(tags)),
+      signature_def_map_(std::move(signature_def_map)),
+      signature_keys_(std::move(signature_keys)) {}
+
+absl::Status CalibrationComponent::ExportToSavedModel(
+    ModuleOp module_op, absl::string_view calibration_data_dir,
+    const bool force_regenerate_calibration_data,
+    const absl::string_view dst_saved_model_path) {
+  TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTmpFileName());
+
+  // Clone ModuleOp and function aliases so changes in this pipeline won't
+  // be reflected in the original values.
+  mlir::OwningOpRef<ModuleOp> cloned_module_ref(module_op.clone());
+
+  TF_RETURN_IF_ERROR(RunCalibrationPasses(*cloned_module_ref, *ctx_,
+                                          calibration_data_dir,
+                                          force_regenerate_calibration_data));
+
+  const bool is_calibration_required =
+      IsCalibrationRequired(*cloned_module_ref);
+  if (!is_calibration_required) return absl::OkStatus();
+
+  // `duplicate_shape_determining_constants = false` because the
+  // resulting graph of this step is not expected to be loaded on TPU.
+  const ExportOptions export_opts = {
+      /*duplicate_shape_determining_constants=*/false,
+      /*unfreeze_constants=*/false, checkpoint_dir,
+      /*debug_name=*/absl::StrCat(kName, kExportStepSuffix)};
+
+  TF_ASSIGN_OR_RETURN(const SmallVector<AssetFileDef> asset_file_defs,
+                      RunExportPasses(export_opts, *ctx_, *cloned_module_ref));
+
+  TF_ASSIGN_OR_RETURN(ExportedModel exported_model,
+                      ConvertMlirModuleToExportedModel(
+                          *cloned_module_ref, checkpoint_dir, function_aliases_,
+                          {asset_file_defs.begin(), asset_file_defs.end()}));
+
+  py_function_lib_->SaveExportedModel(dst_saved_model_path, exported_model,
+                                      src_saved_model_path_, tags_,
+                                      signature_def_map_);
+
+  return absl::OkStatus();
+}
+
+absl::StatusOr<ModuleOp> CalibrationComponent::Run(
+    ModuleOp module_op, const QuantizationConfig& config) {
+  // Export the calibration model to SavedModel.
+ TF_ASSIGN_OR_RETURN(const std::string calibration_saved_model_dir, + CreateTmpDir()); + + std::string calibration_data_dir = + config.calibration_options().calibration_data_dir(); + if (calibration_data_dir.empty()) { + TF_ASSIGN_OR_RETURN(calibration_data_dir, CreateTmpDir()); + } + + TF_RETURN_IF_ERROR(ExportToSavedModel( + module_op, calibration_data_dir, + config.calibration_options().force_regenerate_calibration_data(), + calibration_saved_model_dir)); + + TF_ASSIGN_OR_RETURN(std::vector calibration_saved_model_files, + ListDirectory(calibration_saved_model_dir)); + if (!calibration_saved_model_files.empty()) { + // Translate `RepresentativeDatasetConfig`s to signature key -> + // `RepresentativeDatasetFile` mapping. + const auto dataset_configs = + config.calibration_options().representative_datasets(); + const std::vector dataset_config_vector( + dataset_configs.begin(), dataset_configs.end()); + TF_ASSIGN_OR_RETURN( + const auto representative_dataset_file_map, + CreateRepresentativeDatasetFileMap(dataset_config_vector)); + + // Run calibration on the exported model. + if (py_function_lib_->RunCalibration( + calibration_saved_model_dir, signature_keys_, tags_, + /*force_graph_mode_calibration=*/true, + representative_dataset_file_map) == std::nullopt) { + return absl::InternalError( + "CalibrationComponent error: Failed to run calibration."); + } + } + + if (absl::Status status = AddCalibrationStatistics( + module_op, calibration_data_dir, config.calibration_options(), + *py_function_lib_); + !status.ok()) { + LOG(WARNING) << "Some CustomAggregator ops do not have min or max " + "values. Parts of the graph are not quantized. " + << status; + } + + return module_op; +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/tf_component.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/tf_component.h new file mode 100644 index 000000000000..cb590583ad2c --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/tf_component.h @@ -0,0 +1,126 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_TF_COMPONENT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_TF_COMPONENT_H_ + +#include +#include +#include + +#include "absl/base/nullability.h" +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace mlir::tf_quant::stablehlo { + +using ::mlir::quant::stablehlo::Component; +using ::mlir::quant::stablehlo::FunctionAlias; +using ::mlir::quant::stablehlo::FunctionName; + +// Performs post-calibration graph transformation as part of post-training +// static-range quantization. +// +// The resulting `ModuleOp` contains quantized StableHLO ops serialized in +// `TF::XlaCallModuleOp`s. They are quantized using the statistics collected +// after the calibration step, corresponding to each `TF::CustomAggregatorOp`s +// in the input module op. +// +// TODO: b/320607042 - Add tests for this component on the python layer. +class CalibrationComponent : public Component { + public: + // Name of the post-training quantization post-calibration step. Used for + // debugging purposes. + static constexpr absl::string_view kName = "quant_ptq_calibration"; + + // `CalibrationComponent` ctor with necessary information required to run + // calibration on a `ModuleOp`. Meta information like `function_aliases`, + // `tags`, `signature_def_map`, and `signature_keys` are required to properly + // save and load the module_op to and from SavedModel. + // `representative_dataset_file_map` contains information about the + // calibration dataset. + CalibrationComponent( + MLIRContext* absl_nonnull ctx, + const tensorflow::quantization::PyFunctionLibrary* absl_nonnull + py_function_lib, + absl::string_view src_saved_model_path, + absl::flat_hash_map function_aliases, + std::unordered_set tags, + absl::flat_hash_map + signature_def_map, + std::vector signature_keys); + + // Runs calibration on `module_op` and returns a calibrated ModuleOp with + // calibrated statistics embedded. + absl::StatusOr Run( + ModuleOp module_op, + const ::stablehlo::quantization::QuantizationConfig& config) override; + + private: + // Exports `module_op` to SavedModel at `dst_saved_model_path`. This is used + // to export the pre-calibrated `module_op` to SavedModel so that the + // calibration process can use it to load and run the graph with the + // representative dataset. Returns a failure status if the export fails. + absl::Status ExportToSavedModel(ModuleOp module_op, + absl::string_view calibration_data_dir, + bool force_regenerate_calibration_data, + absl::string_view dst_saved_model_path); + + // Imports the SavedModel at `calibrated_saved_model_path` to `ModuleOp` after + // running calibration. 
+ absl::StatusOr ImportCalibratedSavedModel( + absl::string_view calibrated_saved_model_path); + + MLIRContext* absl_nonnull ctx_; + + // Contains function implementations from the python layer. Should be injected + // from the python level using pybind11. + const tensorflow::quantization::PyFunctionLibrary* absl_nonnull + py_function_lib_; + + // Path to the pre-calibrated SavedModel. + std::string src_saved_model_path_; + + // Function alias mapping for pre-calibrated SavedModel. Used to preserve + // aliased functions. + absl::flat_hash_map function_aliases_; + + // Tags to identify the MetaGraphDef to load from a SavedModel. + const std::unordered_set tags_; + + const absl::flat_hash_map + signature_def_map_; + + // Signature keys to identify the functions to load & quantize. + const std::vector signature_keys_; +}; + +// Runs passes to prepare the calibration model. +absl::Status RunCalibrationPasses(mlir::ModuleOp module_op, MLIRContext& ctx, + absl::string_view calibration_data_dir, + bool force_regenerate_calibration_data); + +} // namespace mlir::tf_quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_TF_COMPONENT_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc index 1bbf67389366..c5fc8b5b3d8d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc @@ -116,15 +116,13 @@ void AddXlaCallModuleOpDeserializationPasses(OpPassManager& pm) { } void AddShapeLegalizationPasses(OpPassManager& pm) { - pm.addPass(mhlo::createStablehloLegalizeToHloPass()); + // TODO: We may need to make a parent pass here that does + // shape->StableHLO+cstr because the stablehlo pass requires that the ops made + // by cstr are legal. pm.addNestedPass( - mhlo::createShapeLegalizeToHloPass(/*legalizeConstraints=*/true)); - // The following 2 passes are used to clean up the spurious UnrealizedCast ops - // and shape.assuming regions leftover from the ShapeLegalizeToHlo pass. See - // pass definition for details. + createConvertShapeToStablehloWithConstraintsPass()); pm.addPass(createReconcileUnrealizedCastsPass()); pm.addNestedPass(mlir::createCanonicalizerPass()); - pm.addPass(mhlo::createHloLegalizeToStablehloPass()); } void AddStablehloQuantToIntPasses(OpPassManager& pm) { diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc index 45213c10b3b7..ec4a10af74bc 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc @@ -38,7 +38,7 @@ using ::stablehlo::quantization::QuantizationSpecs; using ::tensorflow::quantization::RunPasses; PostCalibrationComponent::PostCalibrationComponent( - absl::Nonnull ctx) + MLIRContext* absl_nonnull ctx) : ctx_(ABSL_DIE_IF_NULL(ctx)) {} // Crash OK absl::StatusOr PostCalibrationComponent::Run( diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h index 6e3762817e16..6692047628f0 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h @@ -39,7 +39,7 @@ class PostCalibrationComponent : public Component { // debugging purposes. 
static constexpr absl::string_view kName = "quant_ptq_post_calibration"; - explicit PostCalibrationComponent(absl::Nonnull ctx); + explicit PostCalibrationComponent(MLIRContext* absl_nonnull ctx); absl::StatusOr Run( ModuleOp module_op, @@ -51,7 +51,7 @@ class PostCalibrationComponent : public Component { const ::stablehlo::quantization::PipelineConfig& pipeline_config) const; private: - absl::Nonnull ctx_; + MLIRContext* absl_nonnull ctx_; }; } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.cc index bd7cab73d90c..3de90290df20 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.cc @@ -30,8 +30,7 @@ namespace mlir::quant::stablehlo { using ::stablehlo::quantization::QuantizationConfig; using ::tensorflow::quantization::RunPasses; -PreCalibrationComponent::PreCalibrationComponent( - absl::Nonnull ctx) +PreCalibrationComponent::PreCalibrationComponent(MLIRContext* absl_nonnull ctx) : ctx_(ABSL_DIE_IF_NULL(ctx)) {} // Crash OK absl::StatusOr PreCalibrationComponent::Run( diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.h index bdc61bafa569..705f8b95bda1 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.h @@ -38,14 +38,14 @@ class PreCalibrationComponent : public Component { // debugging purposes. static constexpr absl::string_view kName = "quant_ptq_pre_calibration"; - explicit PreCalibrationComponent(absl::Nonnull ctx); + explicit PreCalibrationComponent(MLIRContext* absl_nonnull ctx); absl::StatusOr Run( ModuleOp, const ::stablehlo::quantization::QuantizationConfig& config) override; private: - absl::Nonnull ctx_; + MLIRContext* absl_nonnull ctx_; }; } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc index 47aaf3121656..ca1033746383 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc @@ -56,8 +56,8 @@ using ::tensorflow::quantization::ExportedModel; using ::tensorflow::quantization::PyFunctionLibrary; StaticRangePtqComponent::StaticRangePtqComponent( - absl::Nonnull ctx, - absl::Nonnull py_function_library, + MLIRContext* absl_nonnull ctx, + const PyFunctionLibrary* absl_nonnull py_function_library, const absl::string_view src_saved_model_path, std::vector signature_keys, std::unordered_set tags, diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h index 69bd9da6733c..104df9aa50da 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h @@ -51,9 +51,9 @@ class StaticRangePtqComponent : public Component { // `CalibrationComponent`. For detailed explanation of each argument, see the // comment of `CalibrationComponent`'s constructor. 
  StaticRangePtqComponent(
-      absl::Nonnull<MLIRContext*> ctx,
-      absl::Nonnull<const tensorflow::quantization::PyFunctionLibrary*>
-          py_function_library,
+      MLIRContext* absl_nonnull ctx,
+      const tensorflow::quantization::PyFunctionLibrary* absl_nonnull
+          py_function_library,
       absl::string_view src_saved_model_path,
       std::vector<std::string> signature_keys,
       std::unordered_set<std::string> tags,
@@ -69,7 +69,7 @@ class StaticRangePtqComponent : public Component {
  private:
   // A non-owning `MLIRContext`. This `MLIRContext` should exceed the lifetime
   // of `StaticRangePtqComponent`.
-  absl::Nonnull<MLIRContext*> ctx_;
+  MLIRContext* absl_nonnull ctx_;
   // This component consists of three sub-components, `PreCalibrationComponent`,
   // `CalibrationComponent`, and `PostCalibrationComponent`.
   std::array<std::unique_ptr<Component>, 3> sub_components_;
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pass_pipeline.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pass_pipeline.cc
new file mode 100644
index 000000000000..f5512470bbdc
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pass_pipeline.cc
@@ -0,0 +1,177 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pass_pipeline.h"
+
+#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"  // from @llvm-project
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Transforms/Passes.h"  // from @llvm-project
+#include "stablehlo/transforms/Passes.h"  // from @stablehlo
+#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
+#include "xla/mlir_hlo/mhlo/transforms/passes.h"
+
+namespace mlir::tf_quant::stablehlo {
+
+using ::stablehlo::quantization::CalibrationOptions;
+using ::stablehlo::quantization::DebuggerConfig;
+using ::stablehlo::quantization::PipelineConfig;
+using ::stablehlo::quantization::QuantizationSpecs;
+
+void AddPreCalibrationPasses(OpPassManager& pm,
+                             const CalibrationOptions& calibration_options,
+                             const QuantizationSpecs& quantization_specs,
+                             const DebuggerConfig& debugger_config) {
+  // Convert NCHW tensors to NHWC along with extra optimizations, as
+  // downstream passes perform better optimizations when dealing with NHWC
+  // formatted tensors.
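+  // `AddProcessNchwTensorPasses` (defined later in this file) applies
+  // `createNchwConvolutionToNhwcPass`, `createDeferActivationTransposePass`,
+  // and `createFoldConstantTransposePass` to each function.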
+ AddProcessNchwTensorPasses(pm); + + pm.addPass(CreateLiftQuantizableSpotsAsFunctionsPass(quantization_specs)); + if (debugger_config.debugger_type() != + DebuggerConfig::DEBUGGER_TYPE_UNSPECIFIED) { + pm.addPass(CreateAddDumpTensorOpPass(debugger_config.debugger_type(), + debugger_config.log_dir_path())); + } + pm.addNestedPass( + CreateInsertCustomAggregationOpsPass(calibration_options)); +} + +void AddPostCalibrationPasses(OpPassManager& pm, + const PipelineConfig& pipeline_config, + const QuantizationSpecs& specs) { + QuantizeCompositeFunctionsPassOptions options; + // TODO: b/331120943 - Temporarily set below to true, signaling per-channel + // quantization will be applied for all where applicable. This will be + // replaced by individual `Method` in `QuantizationSpecs`. + options.enable_per_channel_quantized_weight_ = true; + // For debugging purposes. + options.mlir_dump_file_name_ = "quantize_composite_functions"; + options.merge_fusion_with_dequantize_ = + pipeline_config.merge_fusion_with_dequantize(); + + AddShapeLegalizationPasses(pm); + pm.addNestedPass( + CreateConvertCustomAggregationOpToQuantStatsPass()); + pm.addPass(createQuantizeCompositeFunctionsPass(options)); + // Add an inliner pass to inline quantized StableHLO functions. + pm.addPass(createInlinerPass()); + if (pipeline_config.unpack_quantized_types()) { + AddStablehloQuantToIntPasses(pm); + } +} + +void AddWeightOnlyQuantizationPasses( + OpPassManager& pm, const QuantizationSpecs& quantization_specs, + const PipelineConfig& pipeline_config, + const DebuggerConfig& debugger_config) { + // For models with NCHW convolution format. This pass is required because + // downstream pipeline handles NHWC convolution better for most cases. + pm.addNestedPass(createNchwConvolutionToNhwcPass()); + + // Folds `stablehlo.constant`->`stablehlo.transpose` patterns, which is often + // generated as by-products after optimizing dimension numbers (e.g. + // NCHW->NHWC convolution conversion). + pm.addNestedPass(createFoldConstantTransposePass()); + pm.addPass(CreateLiftQuantizableSpotsAsFunctionsPass(quantization_specs)); + if (debugger_config.debugger_type() != + DebuggerConfig::DEBUGGER_TYPE_UNSPECIFIED) { + pm.addPass(CreateAddDumpTensorOpPass(debugger_config.debugger_type(), + debugger_config.log_dir_path())); + } + AddShapeLegalizationPasses(pm); + QuantizeCompositeFunctionsPassOptions options; + // For debugging purposes. + options.mlir_dump_file_name_ = "quantize_composite_functions"; + pm.addPass(createQuantizeCompositeFunctionsPass(options)); + + // Add an inliner pass to inline quantized StableHLO functions. + pm.addPass(createInlinerPass()); + if (pipeline_config.unpack_quantized_types()) { + AddStablehloQuantToIntPasses(pm); + } +} + +void AddXlaCallModuleOpDeserializationPasses(OpPassManager& pm) { + pm.addPass(TF::CreateXlaCallModuleDeserializationPass()); + pm.addPass(createRestoreFunctionNamePass()); + pm.addPass(createUnwrapXlaCallModuleOpPass()); + pm.addPass(createSymbolDCEPass()); +} + +void AddShapeLegalizationPasses(OpPassManager& pm) { + // TODO: We may need to make a parent pass here that does + // shape->StableHLO+cstr because the stablehlo pass requires that the ops made + // by cstr are legal. 
+ pm.addNestedPass( + createConvertShapeToStablehloWithConstraintsPass()); + pm.addPass(createReconcileUnrealizedCastsPass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); +} + +void AddStablehloQuantToIntPasses(OpPassManager& pm) { + pm.addNestedPass( + mlir::stablehlo::createStablehloLegalizeQuantToMathPass()); + // StableHLO -> MHLO legalization. + pm.addPass(mhlo::createStablehloLegalizeToHloPass()); + pm.addNestedPass(createCanonicalizerPass()); + // Integer graph optimization relies on chlo broadcast ops for easier handling + // of dynamic shapes. Therefore we lower chlo ops after optimization. + pm.addNestedPass( + quant::stablehlo::CreateOptimizeIntGraphPass()); + pm.addNestedPass(mhlo::createChloLegalizeToHloPass()); + pm.addNestedPass(createCanonicalizerPass()); + pm.addPass(createSymbolDCEPass()); + // MHLO -> StableHLO legalization. + pm.addPass(mhlo::createHloLegalizeToStablehloPass()); +} + +// NOMUTANTS -- Add tests for individual passes with migration below. +void AddCallModuleSerializationPasses(OpPassManager& pm) { + AddShapeLegalizationPasses(pm); + pm.addPass(createReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass()); + // ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass may create + // duplicate constants. Add canonicalizer to deduplicate. + pm.addNestedPass(mlir::createCanonicalizerPass()); + pm.addPass(TF::CreateXlaCallModuleSerializationPass()); +} + +void AddProcessNchwTensorPasses(OpPassManager& pm) { + // For models with NCHW convolution format. This pass is required because + // downstream pipeline handles NHWC convolution better for most cases. + pm.addNestedPass(createNchwConvolutionToNhwcPass()); + + // Recursively push down the `stablehlo.transpose` ops for activations + // generated by the `NchwConvolutionToNhwc` pass. + pm.addNestedPass(createDeferActivationTransposePass()); + + // Folds `stablehlo.constant`->`stablehlo.transpose` patterns, which is often + // generated as by-products after optimizing dimension numbers (e.g. + // NCHW->NHWC convolution conversion). + pm.addNestedPass(createFoldConstantTransposePass()); +} + +void RegisterPassPipelines() { + static PassPipelineRegistration<> nchw_tensor_format_processing_pipeline( + /*arg=*/"stablehlo-process-nchw-tensor", + /*description=*/"Optimizes tensors with NCHW format.", + AddProcessNchwTensorPasses); +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pass_pipeline.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pass_pipeline.h new file mode 100644 index 000000000000..a0c1e0f38eba --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pass_pipeline.h @@ -0,0 +1,75 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_PASS_PIPELINE_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_PASS_PIPELINE_H_ + +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir::tf_quant::stablehlo { + +// Adds passes for static-range quantization pre-calibration. Inserts ops +// required to collect tensor statistics. +void AddPreCalibrationPasses( + OpPassManager& pm, + const ::stablehlo::quantization::CalibrationOptions& calibration_options, + const ::stablehlo::quantization::QuantizationSpecs& specs, + const ::stablehlo::quantization::DebuggerConfig& debugger_config); + +// Adds passes for static-range quantization post-calibration. Utilizes tensor +// statistics collected from the calibration step and performs quantization. +void AddPostCalibrationPasses( + OpPassManager& pm, + const ::stablehlo::quantization::PipelineConfig& pipeline_config, + const ::stablehlo::quantization::QuantizationSpecs& specs); + +// Adds passes for weight-only quantization. +void AddWeightOnlyQuantizationPasses( + OpPassManager& pm, + const ::stablehlo::quantization::QuantizationSpecs& quantization_specs, + const ::stablehlo::quantization::PipelineConfig& pipeline_config, + const ::stablehlo::quantization::DebuggerConfig& debugger_config); + +// Deserializes StableHLO functions serialized and embedded in XlaCallModuleOps. +void AddXlaCallModuleOpDeserializationPasses(OpPassManager& pm); + +// Legalizes shape/tensor/arith dialect ops to StableHLO for handling dynamic +// shapes, by going through a round-trip to MHLO. +void AddShapeLegalizationPasses(OpPassManager& pm); + +// Serializes the StableHLO module into a tf.XlaCallModuleOp for compatibility +// with passes that expect TF format. This also allows the StableHLO ops to be +// exported as a TF SavedModel. +void AddCallModuleSerializationPasses(OpPassManager& pm); + +// Passes for unpacking quantized ops to int valued StableHLO ops. This is +// useful when uniform quantized types are suboptimal for the hardware. It goes +// through a StableHLO <-> MHLO roundtrip to utilize the MHLOQuantToInt pass. +void AddStablehloQuantToIntPasses(OpPassManager& pm); + +// Processes tensors with NCHW format (== (batch, channel, height, weight)) by +// converting them to NHWC formats along with extra optimizations such as +// constant folding the transpose->convolution pattern. This is useful when +// downstream pipeline (e.g. XLA) is more optimized when accepting NHWC formats. +void AddProcessNchwTensorPasses(OpPassManager& pm); + +// Registers quantization pass pipelines. This is only required when running +// MLIR opt binaries and not required when adding passes programmatically. +void RegisterPassPipelines(); + +} // namespace mlir::tf_quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_PASS_PIPELINE_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_post_calibration.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_post_calibration.cc new file mode 100644 index 000000000000..b59d3c423733 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_post_calibration.cc @@ -0,0 +1,67 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_post_calibration.h" + +#include + +#include "absl/base/nullability.h" +#include "absl/log/die_if_null.h" +#include "absl/status/statusor.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pass_pipeline.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/tf_save_report.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" +#include "xla/mlir_hlo/mhlo/transforms/passes.h" +#include "xla/tsl/platform/errors.h" + +namespace mlir::tf_quant::stablehlo { + +using ::stablehlo::quantization::GetReportFilePath; +using ::stablehlo::quantization::PipelineConfig; +using ::stablehlo::quantization::QuantizationConfig; +using ::stablehlo::quantization::QuantizationSpecs; +using ::tensorflow::quantization::RunPasses; + +PostCalibrationComponent::PostCalibrationComponent( + MLIRContext* absl_nonnull ctx) + : ctx_(ABSL_DIE_IF_NULL(ctx)) {} // Crash OK + +absl::StatusOr PostCalibrationComponent::Run( + ModuleOp module_op, const QuantizationConfig& config) { + TF_RETURN_IF_ERROR(RunPasses( + kName, /*add_passes_func=*/ + [&config](PassManager& pm) { + // Add instrumentation to save quantization report after quantization. + pm.addInstrumentation( + std::make_unique( + GetReportFilePath(config))); + + tf_quant::stablehlo::AddPostCalibrationPasses( + pm, config.pipeline_config(), config.specs()); + }, + *ctx_, module_op)); + return module_op; +} + +void PostCalibrationComponent::AddPasses( + OpPassManager& pm, const QuantizationSpecs& specs, + const PipelineConfig& pipeline_config) const { + tf_quant::stablehlo::AddPostCalibrationPasses(pm, pipeline_config, specs); +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_post_calibration.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_post_calibration.h new file mode 100644 index 000000000000..95d839c07007 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_post_calibration.h @@ -0,0 +1,59 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_POST_CALIBRATION_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_POST_CALIBRATION_H_ + +#include "absl/base/nullability.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + +namespace mlir::tf_quant::stablehlo { + +// Performs post-calibration graph transformation as part of post-training +// static-range quantization. +// +// The resulting `ModuleOp` contains quantized StableHLO ops serialized in +// `TF::XlaCallModuleOp`s. They are quantized using the statistics collected +// after the calibration step, corresponding to each `TF::CustomAggregatorOp`s +// in the input module op. +class PostCalibrationComponent : public quant::stablehlo::Component { + public: + // Name of the post-training quantization post-calibration step. Used for + // debugging purposes. + static constexpr absl::string_view kName = "quant_ptq_post_calibration"; + + explicit PostCalibrationComponent(MLIRContext* absl_nonnull ctx); + + absl::StatusOr Run( + ModuleOp module_op, + const ::stablehlo::quantization::QuantizationConfig& config) override; + + void AddPasses( + OpPassManager& pm, + const ::stablehlo::quantization::QuantizationSpecs& specs, + const ::stablehlo::quantization::PipelineConfig& pipeline_config) const; + + private: + MLIRContext* absl_nonnull ctx_; +}; + +} // namespace mlir::tf_quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_POST_CALIBRATION_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pre_calibration.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pre_calibration.cc new file mode 100644 index 000000000000..f251a69c52a0 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pre_calibration.cc @@ -0,0 +1,49 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pre_calibration.h" + +#include "absl/base/nullability.h" +#include "absl/log/die_if_null.h" +#include "absl/status/statusor.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pass_pipeline.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "xla/tsl/platform/errors.h" + +namespace mlir::quant::stablehlo { + +using ::stablehlo::quantization::QuantizationConfig; +using ::tensorflow::quantization::RunPasses; + +PreCalibrationComponent::PreCalibrationComponent(MLIRContext* absl_nonnull ctx) + : ctx_(ABSL_DIE_IF_NULL(ctx)) {} // Crash OK + +absl::StatusOr PreCalibrationComponent::Run( + ModuleOp module_op, const QuantizationConfig& config) { + TF_RETURN_IF_ERROR(RunPasses( + kName, /*add_passes_func=*/ + [&config](PassManager& pm) { + tf_quant::stablehlo::AddPreCalibrationPasses( + pm, config.calibration_options(), config.specs(), + config.debugger_config()); + }, + *ctx_, module_op)); + return module_op; +} + +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pre_calibration.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pre_calibration.h new file mode 100644 index 000000000000..495798a694b8 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pre_calibration.h @@ -0,0 +1,53 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_PRE_CALIBRATION_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_PRE_CALIBRATION_H_ + +#include "absl/base/nullability.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir::quant::stablehlo { + +// Performs pre-calibration graph transformation as part of post-training +// static-range quantization. + +// The resulting `ModuleOp` contains `TF::CustomAggregatorOp`s for collecting +// quantization statistics, along with `TF::XlaCallModuleOp`s that correspond to +// lifted quantizable functions. +class PreCalibrationComponent : public Component { + public: + // Name of the post-training quantization pre-calibration step. 
Used for + // debugging purposes. + static constexpr absl::string_view kName = "quant_ptq_pre_calibration"; + + explicit PreCalibrationComponent(MLIRContext* absl_nonnull ctx); + + absl::StatusOr Run( + ModuleOp, + const ::stablehlo::quantization::QuantizationConfig& config) override; + + private: + MLIRContext* absl_nonnull ctx_; +}; + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_PRE_CALIBRATION_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_report.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_report.cc new file mode 100644 index 000000000000..131c2372dae5 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_report.cc @@ -0,0 +1,174 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_report.h" + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep + +namespace mlir::tf_quant::stablehlo { +namespace { + +using ::stablehlo::quantization::Method; +using ::stablehlo::quantization::QuantizationResult; +using ::stablehlo::quantization::QuantizationResults; +using ::stablehlo::quantization::io::WriteStringToFile; +using ::tsl::protobuf::TextFormat; + +// Given a `quantized_func_name` that starts with `kQuantizedFuncPrefix`, +// converts `kQuantizedFuncPrefix` to `kCompositeFuncPrefix`. +std::string GetCompositeFunctionName(const StringRef quantized_func_name) { + return Twine(kCompositeFuncPrefix) + .concat(quantized_func_name.rsplit(kQuantizedFuncPrefix).second) + .str(); +} + +// Retrieves `QuantizationResult` from `call_op`. If the callee's name starts +// with `kQuantizedFuncPrefix` then a `QuantizationResult` will be returned with +// its `name` field set to the callee's name reverted back to the lifted +// function's name. Also, `call_op` must have the `kQuantizationMethodAttr` +// attribute, which is deserialized as `Method` and set in the returned +// `QuantizationResult`. Otherwise, it returns `std::nullopt`. 
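+//
+// For example, assuming the usual lifted-function prefixes (`kQuantizedFuncPrefix`
+// == "quantized_" and `kCompositeFuncPrefix` == "composite_"), a call op whose
+// callee is named "quantized_dot_general_fn_1" (an illustrative name) yields a
+// `QuantizationResult` whose `quantizable_unit.name` is
+// "composite_dot_general_fn_1".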
+std::optional<QuantizationResult> GetQuantizationResult(func::CallOp call_op) {
+  const StringRef callee_name = call_op.getCalleeAttr().getValue();
+  if (!callee_name.starts_with(kQuantizedFuncPrefix)) {
+    return std::nullopt;  // `call_op` is not a quantized function call.
+  }
+
+  absl::StatusOr<Method> method = GetQuantizationMethod(call_op);
+  if (!method.ok()) {
+    call_op->emitError() << "Failed to get quantization method: "
+                         << method.status().ToString();
+    return std::nullopt;
+  }
+
+  QuantizationResult result{};
+  result.mutable_quantizable_unit()->set_name(
+      GetCompositeFunctionName(callee_name));
+  *result.mutable_method() = std::move(*method);
+  return result;
+}
+
+// Retrieves `QuantizationResult` from `xla_call_module_op`. If
+// `xla_call_module_op` is a quantizable unit, then a `QuantizationResult` will
+// be returned with its `name` field set to the callee's name. The `method`
+// field will be set to `NoQuantization` because any remaining
+// `xla_call_module_op`s mean they were not quantized. Returns `std::nullopt`
+// if `xla_call_module_op` is not a quantizable unit.
+std::optional<QuantizationResult> GetQuantizationResult(
+    TF::XlaCallModuleOp xla_call_module_op) {
+  const StringAttr callee_name_attr =
+      mlir::dyn_cast_or_null<StringAttr>(xla_call_module_op->getDiscardableAttr(
+          kOriginalStablehloEntryFunctionAttrName));
+
+  // `TF::XlaCallModuleOp` without the `_original_entry_function` means it is
+  // not a quantizable unit.
+  if (callee_name_attr == nullptr) return std::nullopt;
+
+  if (callee_name_attr.getValue().starts_with(kCompositeFuncPrefix)) {
+    QuantizationResult result{};
+    result.mutable_quantizable_unit()->set_name(
+        callee_name_attr.getValue().str());
+    result.mutable_method()->mutable_no_quantization();
+    return result;
+  } else {
+    return std::nullopt;
+  }
+}
+
+// Populates quantized ops from `module_op` to `results`. After going through
+// the quantization passes, quantized ops are represented as `func::CallOp` with
+// a callee's prefix of `quantized_`.
+void PopulateQuantizedResults(ModuleOp module_op,
+                              QuantizationResults& results) {
+  module_op.walk([&results](func::CallOp call_op) {
+    std::optional<QuantizationResult> result = GetQuantizationResult(call_op);
+    if (result == std::nullopt) return WalkResult::skip();
+
+    *results.add_results() = std::move(*result);
+    return WalkResult::advance();
+  });
+}
+
+// Populates non-quantized ops from `module_op` to `results`. After going
+// through the quantization passes, non-quantized quantizable units remain as
+// `TF::XlaCallModuleOp` with a callee's prefix of `composite_`.
+void PopulateNonQuantizedResults(ModuleOp module_op, + QuantizationResults& results) { + module_op.walk([&results](TF::XlaCallModuleOp xla_call_module_op) { + std::optional result = + GetQuantizationResult(xla_call_module_op); + if (result == std::nullopt) return WalkResult::skip(); + + *results.add_results() = std::move(*result); + return WalkResult::advance(); + }); +} + +} // namespace + +QuantizationReport::QuantizationReport(ModuleOp module_op) + : quantization_results_(CollectResultsFromModuleOp(module_op)) {} + +QuantizationResults QuantizationReport::CollectResultsFromModuleOp( + ModuleOp module_op) const { + QuantizationResults results{}; + + PopulateQuantizedResults(module_op, results); + PopulateNonQuantizedResults(module_op, results); + + return results; +} + +void QuantizationReport::AddQuantizationResult(QuantizationResult&& result) { + *quantization_results_.add_results() = std::move(result); +} + +std::string QuantizationReport::ToString() const { + std::string results_str{}; + TextFormat::PrintToString(quantization_results_, &results_str); + + return absl::StrCat("===== Quantization Report =====\n\n", results_str, + "\n===== Quantization Report End =====\n\n"); +} + +void QuantizationReport::Print() const { + llvm::outs() << ToString(); + llvm::outs().flush(); // Show the report immediately. +} + +absl::Status QuantizationReport::Save(const StringRef file_path) const { + std::string results_str{}; + TextFormat::PrintToString(GetQuantizationResults(), &results_str); + + return WriteStringToFile(file_path, results_str); +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_report.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_report.h new file mode 100644 index 000000000000..9bd359c6c95e --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_report.h @@ -0,0 +1,71 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_REPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_REPORT_H_ + +#include + +#include "absl/status/status.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + +namespace mlir::tf_quant::stablehlo { + +// A class that manages information about `QuantizableUnit`s post-quantization, +// internally in the form of `QuantizationUnits`. It is used to collect +// quantization summary from a quantized `ModuleOp` and emit it in a human- and +// machine-readable format. +class QuantizationReport { + public: + QuantizationReport() = default; + + // Initializes `QuantizationReport` by collecting `QuantizationResults` from + // `module_op`. + explicit QuantizationReport(ModuleOp module_op); + + // Adds a `QuantizationResult` to the report. 
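+  // Illustrative usage (the unit name and the `report` variable are
+  // hypothetical; the setters mirror those used in `tf_report.cc`):
+  //
+  //   QuantizationResult result{};
+  //   result.mutable_quantizable_unit()->set_name("composite_dot_general_fn_1");
+  //   result.mutable_method()->mutable_no_quantization();
+  //   report.AddQuantizationResult(std::move(result));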
+ void AddQuantizationResult( + ::stablehlo::quantization::QuantizationResult&& result); + + // Returns `QuantizationResults` that are registered in this report. + const ::stablehlo::quantization::QuantizationResults& GetQuantizationResults() + const { + return quantization_results_; + } + + // Returns a human-readable string representation of this report. + std::string ToString() const; + + // Prints a human-readable report to stdout. + void Print() const; + + // Saves the report to `file_path`. The textproto representation of + // `QuantizationResults` will be written to the file. Returns non-ok status + // when the file write fails. + absl::Status Save(StringRef file_path) const; + + private: + ::stablehlo::quantization::QuantizationResults CollectResultsFromModuleOp( + ModuleOp module_op) const; + + // Quantization results that are registered in this report. A quantization + // result may be added manually by calling `AddQuantizationResult`. + ::stablehlo::quantization::QuantizationResults quantization_results_; +}; + +} // namespace mlir::tf_quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_REPORT_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_export.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_export.cc new file mode 100644 index 000000000000..5b5c37a6deb1 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_export.cc @@ -0,0 +1,290 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_export.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/base/attributes.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pass_pipeline.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/constants.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/tf_unfreeze_constants.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_executor_to_graph.h" +#include "xla/tsl/platform/errors.h" +#include "xla/tsl/platform/statusor.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/protobuf/saver.pb.h" + +namespace mlir::tf_quant::stablehlo { +namespace { + +using ::mlir::tf_saved_model::kTfSavedModelIndexPathAttr; +using ::mlir::tf_saved_model::kTfSavedModelInitializerInitType; +using ::mlir::tf_saved_model::kTfSavedModelInitializerRestoreType; +using ::stablehlo::quantization::QuantizationConfig; +using ::stablehlo::quantization::io::GetLocalTmpFileName; +using ::tensorflow::AssetFileDef; +using ::tensorflow::FunctionDefLibrary; +using ::tensorflow::FunctionLibraryDefinition; +using ::tensorflow::Graph; +using ::tensorflow::GraphDef; +using ::tensorflow::Node; +using ::tensorflow::NodeDef; +using ::tensorflow::OpRegistry; +using ::tensorflow::SaverDef; +using ::tensorflow::quantization::ExportedModel; +using ::tensorflow::quantization::RunPasses; +using ::tensorflow::quantization::UnfreezeConstantsAndSaveVariables; + +// Finds and returns the name of the node from a set of control output nodes. +// The name should contain the string `contains`. Returns an empty string if no +// node whose name contains `contains` is found. Assumes there is at most one +// such a node. 
+std::string GetNodeName(const std::vector<std::string>& control_ret_node_names,
+                        const absl::string_view contains) {
+  for (const std::string& node_name : control_ret_node_names) {
+    if (absl::StrContains(node_name, contains)) {
+      VLOG(1) << "Node found: " << node_name << ", contains: " << contains;
+      return node_name;
+    }
+  }
+  VLOG(1) << "Could not find node whose name contains: " << contains;
+  return "";
+}
+
+// Returns the file prefix tensor name. An empty string is returned if no such
+// tensor is found (when there are no variables to restore, it is expected that
+// the file prefix tensor does not exist). The file prefix tensor is found among
+// the "_Arg" nodes, as it is translated from the MLIR @main function's
+// argument. It also must have the attribute `tf_saved_model.index_path =
+// ["__tf_file_prefix"]`.
+//
+// See `MergeSaveFunctionOpsToMainPass` for details on how the file prefix
+// tensor ends up at the MLIR @main function's argument.
+std::string FindFilePrefixTensorName(const GraphDef& graph_def) {
+  for (const NodeDef& node_def : graph_def.node()) {
+    if (node_def.op() == FunctionLibraryDefinition::kArgOp) {
+      // Matches the `tf_saved_model.index_path = ["__tf_file_prefix"]`.
+      const auto index_path_attr_itr =
+          node_def.attr().find(kTfSavedModelIndexPathAttr.str());
+      if (index_path_attr_itr != node_def.attr().end()) {
+        const auto& index_paths = index_path_attr_itr->second.list().s();
+        if (absl::c_find(index_paths, quant::kTfFilePrefix.str()) !=
+            index_paths.end()) {
+          // ":0" appended to indicate that it is a tensor, not an Operation.
+          return absl::StrCat(node_def.name(), ":0");
+        }
+      }
+    }
+  }
+  return "";
+}
+
+}  // namespace
+
+absl::StatusOr<ExportedModel> CreateExportedModel(
+    const std::vector<std::string>& signature_keys,
+    const std::unordered_set<std::string>& tags,
+    const QuantizationConfig& quantization_config,
+    absl::string_view debug_name_prefix,
+    const absl::flat_hash_map<FunctionName, FunctionAlias>& function_aliases,
+    MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND, ModuleOp module_op) {
+  TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTmpFileName());
+  const ExportOptions export_opts = {
+      /*duplicate_shape_determining_constants=*/true,
+      /*unfreeze_constants=*/false, checkpoint_dir,
+      /*debug_name=*/
+      absl::StrCat(debug_name_prefix, kExportStepSuffix)};
+
+  TF_ASSIGN_OR_RETURN(const SmallVector<AssetFileDef> asset_file_defs,
+                      RunExportPasses(export_opts, ctx, module_op));
+
+  return ConvertMlirModuleToExportedModel(
+      module_op, checkpoint_dir, function_aliases,
+      {asset_file_defs.begin(), asset_file_defs.end()});
+}
+
+ExportedModel CreateExportedModelFromGraphDef(
+    GraphDef&& graph_def, const absl::string_view init_node_name,
+    const absl::string_view checkpoint_dir,
+    const std::optional<SaverDef> saver_def,
+    const absl::flat_hash_map<FunctionName, FunctionAlias>& function_aliases,
+    const std::vector<AssetFileDef>& asset_file_defs) {
+  ExportedModel exported_model{};
+  *exported_model.mutable_graph_def() = graph_def;
+  exported_model.set_init_node_name(std::string(init_node_name));
+  exported_model.set_checkpoint_dir(std::string(checkpoint_dir));
+
+  exported_model.mutable_function_aliases()->insert(function_aliases.begin(),
+                                                    function_aliases.end());
+
+  for (const AssetFileDef& asset_file_def : asset_file_defs) {
+    *exported_model.mutable_asset_file_defs()->Add() = asset_file_def;
+  }
+
+  if (saver_def != std::nullopt) {
+    *exported_model.mutable_saver_def() = *std::move(saver_def);
+  }
+
+  return exported_model;
+}
+
+void AddExportPasses(mlir::PassManager& pm,
+                     const bool duplicate_shape_determining_constants) {
+  AddCallModuleSerializationPasses(pm);
+
if (duplicate_shape_determining_constants) { + pm.addNestedPass( + mlir::tf_quant::CreateDuplicateShapeDeterminingConstantsPass()); + } + + pm.addPass(mlir::tf_quant::CreateInsertMainFunctionPass()); + pm.addPass(mlir::tf_quant::CreateLiftHashTableOpsAsArgsPass()); + pm.addNestedPass( + mlir::CreateFunctionalToExecutorDialectConversionPass()); + pm.addPass(mlir::CreateBreakUpIslandsPass()); + pm.addPass(mlir::tf_quant::CreateMergeInitializerFunctionOpsToMainPass()); + pm.addPass(mlir::tf_quant::CreateMergeSaveFunctionOpsToMainPass()); + pm.addNestedPass( + mlir::tf_quant::CreateMergeDuplicateResourceOpsPass()); + + // Used to clean up the "tf._noinliner" attribute that is previously used to + // prevent certain functions from being inlined (see + // `MarkFunctionsNoinlinePass`). InlinerPass must not come after this pass. + pm.addPass(mlir::TF::CreateStripNoinlineAttributePass()); +} + +absl::StatusOr> CreateSaverDef( + const std::vector& control_ret_node_names, + const GraphDef& graph_def) { + const std::string filename_tensor_name = FindFilePrefixTensorName(graph_def); + const std::string restore_op_name = + GetNodeName(control_ret_node_names, kTfSavedModelInitializerRestoreType); + const std::string save_node_name = + GetNodeName(control_ret_node_names, quant::kTfQuantSaveOpName); + + const std::vector fields = { + filename_tensor_name, restore_op_name, save_node_name}; + const auto is_empty_predicate = [](const absl::string_view s) { + return s.empty(); + }; + + if (absl::c_all_of(fields, is_empty_predicate)) { + return std::nullopt; + } else if (absl::c_none_of(fields, is_empty_predicate)) { + SaverDef saver_def{}; + saver_def.set_version(SaverDef::V2); + saver_def.set_filename_tensor_name(filename_tensor_name); + saver_def.set_restore_op_name(restore_op_name); + // :0 attached to indicate the first result tensor. This saves the model + // checkpoint when fetched. + saver_def.set_save_tensor_name(absl::StrCat(save_node_name, ":0")); + return saver_def; + } else { + return absl::InternalError( + absl::StrCat("Failed to create SaverDef. Fields should be either all " + "empty strings or all non-empty strings. 
Got fields: ", + absl::StrJoin(fields, ","))); + } +} + +absl::StatusOr ConvertMlirModuleToExportedModel( + const mlir::ModuleOp module_op, const absl::string_view checkpoint_dir, + const absl::flat_hash_map& function_aliases, + const std::vector& asset_file_defs) { + const tensorflow::GraphExportConfig config{}; + FunctionLibraryDefinition flib_def{OpRegistry::Global(), + FunctionDefLibrary()}; + std::unique_ptr graph; + absl::flat_hash_set control_ret_nodes{}; + TF_RETURN_IF_ERROR(tensorflow::tf2xla::v2::ConvertTfExecutorToGraph( + module_op, config, &graph, &flib_def, &control_ret_nodes)); + + GraphDef graph_def{}; + graph->ToGraphDef(&graph_def); + + std::vector control_ret_node_names{}; + for (Node* node : control_ret_nodes) { + control_ret_node_names.push_back(node->name()); + } + const std::string init_node_name = + GetNodeName(control_ret_node_names, kTfSavedModelInitializerInitType); + + TF_ASSIGN_OR_RETURN(const std::optional saver_def, + CreateSaverDef(control_ret_node_names, graph_def)); + + return CreateExportedModelFromGraphDef(std::move(graph_def), init_node_name, + checkpoint_dir, std::move(saver_def), + function_aliases, asset_file_defs); +} + +absl::StatusOr> RunExportPasses( + const ExportOptions& export_opts, MLIRContext& ctx, ModuleOp module_op) { + if (export_opts.unfreeze_constants) { + TF_RETURN_IF_ERROR(UnfreezeConstantsAndSaveVariables( + export_opts.checkpoint_dir, ctx, module_op)); + LOG(INFO) << "Unfrozen constants and saved variables to checkpoint file: " + << export_opts.checkpoint_dir; + } + + TF_RETURN_IF_ERROR(RunPasses( + /*name=*/ + export_opts.debug_name, + /*add_passes_func=*/ + [dup_constants = export_opts.duplicate_shape_determining_constants]( + PassManager& pm) { AddExportPasses(pm, dup_constants); }, + ctx, module_op)); + + FailureOr> asset_file_defs = + quant::ConvertAssetArgs(module_op); + if (failed(asset_file_defs)) { + return absl::InternalError("Failed to convert asset args."); + } + + return *asset_file_defs; +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_export.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_export.h new file mode 100644 index 000000000000..8aaca2b49896 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_export.h @@ -0,0 +1,145 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Functionalities for exporting MLIR ModuleOp to TensorFlow SavedModel. 
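+//
+// For illustration only: a rough sketch of how the pieces declared below fit
+// together (`ctx`, `module_op`, and `function_aliases` are assumed to exist in
+// the caller; paths and names are hypothetical, and error handling is
+// omitted):
+//
+//   const ExportOptions opts = {
+//       /*duplicate_shape_determining_constants=*/true,
+//       /*unfreeze_constants=*/false,
+//       /*checkpoint_dir=*/"/tmp/ckpt",
+//       /*debug_name=*/"my_export"};
+//   TF_ASSIGN_OR_RETURN(const SmallVector<tensorflow::AssetFileDef> assets,
+//                       RunExportPasses(opts, ctx, module_op));
+//   TF_ASSIGN_OR_RETURN(
+//       const tensorflow::quantization::ExportedModel exported_model,
+//       ConvertMlirModuleToExportedModel(module_op, opts.checkpoint_dir,
+//                                        function_aliases,
+//                                        {assets.begin(), assets.end()}));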
+
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_SAVED_MODEL_EXPORT_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_SAVED_MODEL_EXPORT_H_
+
+#include <optional>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "absl/base/attributes.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
+#include "tensorflow/core/protobuf/saver.pb.h"
+
+namespace mlir::tf_quant::stablehlo {
+
+using ::mlir::quant::stablehlo::FunctionAlias;
+using ::mlir::quant::stablehlo::FunctionName;
+
+// Suffix string for the module export step. Used for debugging.
+constexpr absl::string_view kExportStepSuffix = "_export";
+
+// Options when running passes for exporting an MLIR ModuleOp.
+struct ExportOptions {
+  // If set to `true`, it runs `DuplicateShapeDeterminingConstantsPass` before
+  // lowering to tf_executor dialect.
+  bool duplicate_shape_determining_constants = true;
+
+  // If set to `true`, unfreezes constants into variables and saves them to a
+  // checkpoint file. Setting this to `true` is an experimental feature that has
+  // no stability guarantees.
+  bool unfreeze_constants = false;
+
+  // Path to the directory where checkpoint files are saved.
+  std::string checkpoint_dir = "";
+
+  // Name used to identify the ModuleOp this is exporting. Only used for
+  // debugging and does not modify the behavior of the export.
+  std::string debug_name = "stablehlo_quant";
+};
+
+// Creates `ExportedModel` from `module_op`. `module_op` goes through
+// post-processing passes before an `ExportedModel` is created.
+// TODO: b/329206105 - Add unit tests after decomposing post processing passes.
+absl::StatusOr<tensorflow::quantization::ExportedModel> CreateExportedModel(
+    const std::vector<std::string>& signature_keys,
+    const std::unordered_set<std::string>& tags,
+    const ::stablehlo::quantization::QuantizationConfig& quantization_config,
+    absl::string_view debug_name_prefix,
+    const absl::flat_hash_map<FunctionName, FunctionAlias>& function_aliases,
+    MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND, ModuleOp module_op);
+
+// Factory function for `ExportedModel`.
+[[nodiscard]] tensorflow::quantization::ExportedModel
+CreateExportedModelFromGraphDef(
+    tensorflow::GraphDef&& graph_def, absl::string_view init_node_name,
+    absl::string_view checkpoint_dir,
+    std::optional<tensorflow::SaverDef> saver_def,
+    const absl::flat_hash_map<FunctionName, FunctionAlias>& function_aliases,
+    const std::vector<tensorflow::AssetFileDef>& asset_file_defs);
+
+// Creates a new `SaverDef` instance, which contains information regarding
+// checkpoint saving and restoring. This function returns a `SaverDef` instance
+// with four fields populated: `version`, `filename_tensor_name`,
+// `restore_op_name` and `save_tensor_name`. For valid quantized `graph_def` and
+// `control_ret_node_names`, it should be able to retrieve the last three fields
+// if there is at least one variable in the graph.
+//
+// Returns a `std::nullopt` if there are no variables in the graph and no saving
+// & restoring are required.
Returns an `InternalError` status for when the +// required fields are only partially provided. +absl::StatusOr> CreateSaverDef( + const std::vector& control_ret_node_names, + const tensorflow::GraphDef& graph_def); + +// Adds passes for transforming the MLIR module op so that it can be exported +// back to GraphDef. Roughly, this consists of: +// 1) Inserting the @main function, which will become the main Graph. +// 2) Duplicating shape-determining constants. +// 3) Converting TF dialect -> tf_executor dialect. +// 4) Adding initializer function's ops into @main function for correct +// resource initialization when loading the exported model. +// +// Duplicating shape-determining constants is required to place constants that +// affect the shape of a tensor to be placed in the TPU graph instead of in the +// CPU graph, when the graph gets converted for TPU inference. This allows these +// constants to be known at XLA compilation time. +void AddExportPasses(mlir::PassManager& pm, + bool duplicate_shape_determining_constants); + +// Converts MLIR ModuleOp to `ExportedModel`. Returns `InternalError` status +// when the conversion fails. +// +// * `checkpoint_dir` is the directory where checkpoints where variable values +// are stored. This value will be fed to the "file_prefix" tensor to restore the +// variables. +// * `function_aliases` maps the actual function name to the function alias. +// This associates the quantized functions to the original functions' aliases. +// If there were no function aliases in the input model, this should be empty. +// * `asset_file_defs` include information about the assets, if any, that are +// used directly to initialize resources (like hash tables). If no assets are +// used in the model, this should be empty. +absl::StatusOr +ConvertMlirModuleToExportedModel( + mlir::ModuleOp module_op, absl::string_view checkpoint_dir, + const absl::flat_hash_map& function_aliases, + const std::vector& asset_file_defs); + +// Sets up and runs the passes for exporting `module_op`. The behavior of the +// exporting passes is controlled by `export_opts`. Returns `AssetFileDef`s that +// associate the input arguments of @main and the asset file names. Asset file +// names will be used to feed the corresponding tensors during initialization +// upon model loading. +// TODO: b/329206105 - Add unit tests after decomposing post processing passes. +absl::StatusOr> RunExportPasses( + const ExportOptions& export_opts, MLIRContext& ctx, ModuleOp module_op); + +} // namespace mlir::tf_quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_SAVED_MODEL_EXPORT_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_export_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_export_test.cc new file mode 100644 index 000000000000..d5e6a9585b76 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_export_test.cc @@ -0,0 +1,439 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_export.h" + +#include +#include +#include +#include + +#include +#include +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/STLExtras.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_test_base.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "xla/tsl/platform/status_matchers.h" +#include "xla/tsl/platform/statusor.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/protobuf/saver.pb.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep + +namespace mlir::tf_quant::stablehlo { +namespace { + +using ::tensorflow::AssetFileDef; +using ::tensorflow::GraphDef; +using ::tensorflow::NodeDef; +using ::tensorflow::SaverDef; +using ::tensorflow::quantization::ExportedModel; +using ::testing::HasSubstr; +using ::testing::IsEmpty; +using ::testing::SizeIs; +using ::testing::StrEq; +using ::tsl::protobuf::TextFormat; +using ::tsl::testing::IsOk; +using ::tsl::testing::StatusIs; + +TEST(CreateExportedModelTest, CreateExportedModelBasicFieldsSet) { + GraphDef graph_def{}; + ASSERT_TRUE( + TextFormat::ParseFromString(R"pb(node { name: "foo" })pb", &graph_def)); + + const ExportedModel exported_model = CreateExportedModelFromGraphDef( + std::move(graph_def), "init_node_name", "checkpoint_dir", + /*saver_def=*/std::nullopt, + /*function_aliases=*/{}, /*asset_file_defs=*/{}); + ASSERT_THAT(exported_model.graph_def().node(), SizeIs(1)); + EXPECT_THAT(exported_model.graph_def().node()[0].name(), StrEq("foo")); + + EXPECT_THAT(exported_model.init_node_name(), StrEq("init_node_name")); + EXPECT_THAT(exported_model.checkpoint_dir(), StrEq("checkpoint_dir")); + EXPECT_FALSE(exported_model.has_saver_def()); + EXPECT_THAT(exported_model.function_aliases(), IsEmpty()); + EXPECT_THAT(exported_model.asset_file_defs(), IsEmpty()); +} + +TEST(CreateExportedModelTest, CreateExportedModelWithAddedFunctionAliases) { + const ExportedModel exported_model = CreateExportedModelFromGraphDef( + GraphDef(), /*init_node_name=*/"", /*checkpoint_dir=*/"", + /*saver_def=*/std::nullopt, + /*function_aliases=*/{{"func1", "alias1"}, {"func2", "alias2"}}, + /*asset_file_defs=*/{}); + ASSERT_THAT(exported_model.function_aliases(), SizeIs(2)); + EXPECT_TRUE(exported_model.function_aliases().contains("func1")); + EXPECT_THAT(exported_model.function_aliases().at("func1"), StrEq("alias1")); + EXPECT_TRUE(exported_model.function_aliases().contains("func2")); + EXPECT_THAT(exported_model.function_aliases().at("func2"), StrEq("alias2")); +} + +TEST(CreateExportedModelTest, CreateExportedModelWithAddedAssetFileDefs) { + AssetFileDef asset1; + ASSERT_TRUE( + TextFormat::ParseFromString(R"pb(filename: "fname1")pb", &asset1)); + + AssetFileDef asset2; + ASSERT_TRUE( + TextFormat::ParseFromString(R"pb(filename: "fname2")pb", &asset2)); + + const ExportedModel exported_model = CreateExportedModelFromGraphDef( + GraphDef(), /*init_node_name=*/"", /*checkpoint_dir=*/"", + 
/*saver_def=*/std::nullopt, /*function_aliases=*/{}, + /*asset_file_defs=*/{asset1, asset2}); + ASSERT_THAT(exported_model.asset_file_defs(), SizeIs(2)); + EXPECT_THAT(exported_model.asset_file_defs()[0].filename(), StrEq("fname1")); + EXPECT_THAT(exported_model.asset_file_defs()[1].filename(), StrEq("fname2")); +} + +TEST(CreateExportedModelTest, CreateExportedModelWithAddedSaverDef) { + SaverDef saver_def; + ASSERT_TRUE(TextFormat::ParseFromString( + R"pb(filename_tensor_name: "my_file")pb", &saver_def)); + + const ExportedModel exported_model = CreateExportedModelFromGraphDef( + GraphDef(), /*init_node_name=*/"", /*checkpoint_dir=*/"", saver_def, + /*function_aliases=*/{}, /*asset_file_defs=*/{}); + EXPECT_THAT(exported_model.saver_def().filename_tensor_name(), "my_file"); +} + +TEST(CreateSaverDefTest, CreateValidSaverDef) { + // Needs to have a _Arg node with an attribute "tf_saved_model.index_path" = + // ["__tf_file_prefix"]. + GraphDef graph_def; + ASSERT_TRUE(TextFormat::ParseFromString( + R"pb(node { + name: "foo", + op: "_Arg", + attr { + key: "tf_saved_model.index_path", + value { list { s: "__tf_file_prefix" } } + } + })pb", + &graph_def)); + + // Restore op's name should start with "restore_op" and the save op's name + // should start with "tf_quant__save_op". + const std::vector control_ret_node_names = { + "restore_op_0", "tf_quant__save_op_0"}; + + TF_ASSERT_OK_AND_ASSIGN(const std::optional saver_def, + CreateSaverDef(control_ret_node_names, graph_def)); + ASSERT_NE(saver_def, std::nullopt); + EXPECT_THAT(saver_def->version(), SaverDef::V2); + EXPECT_THAT(saver_def->restore_op_name(), "restore_op_0"); + EXPECT_THAT(saver_def->filename_tensor_name(), "foo:0"); + EXPECT_THAT(saver_def->save_tensor_name(), "tf_quant__save_op_0:0"); +} + +TEST(CreateSaverDefTest, ReturnsNulloptIfNoSaverDefRelatedNodesExist) { + TF_ASSERT_OK_AND_ASSIGN( + const std::optional saver_def, + CreateSaverDef(/*control_ret_node_names=*/{}, GraphDef())); + EXPECT_EQ(saver_def, std::nullopt); +} + +TEST(CreateSaverDefTest, ReturnsErrorStatusIfSaverDefNodesPartiallyExist) { + // An _Arg node missing the attribute "tf_saved_model.index_path" = + // ["__tf_file_prefix"]. + GraphDef graph_def; + ASSERT_TRUE(TextFormat::ParseFromString( + R"pb(node { name: "foo", op: "_Arg" })pb", &graph_def)); + + // Restore op's name should start with "restore_op" and the save op's name + // should start with "tf_quant__save_op". + const std::vector control_ret_node_names = { + "restore_op_0", "tf_quant__save_op_0"}; + + const absl::StatusOr> saver_def = + CreateSaverDef(control_ret_node_names, graph_def); + EXPECT_THAT( + saver_def, + StatusIs( + absl::StatusCode::kInternal, + HasSubstr( + "should be either all empty strings or all non-empty strings"))); +} + +// Testing ConvertMlirModuleToExportedModel requires parsing MLIR string to +// ModuleOp. +using ConvertMlirModuleToExportedModelTest = + ::mlir::tf_quant::QuantizationTestBase; + +TEST_F(ConvertMlirModuleToExportedModelTest, SimpleGraphDefSet) { + // Define a module a no-op main function. 
+ mlir::OwningOpRef module_op = ParseModuleOpString(R"mlir( + module attributes {tf_saved_model.semantics} { + func.func @main(%arg: tensor<1x2xf32> {tf_saved_model.index_path = ["input_tensor:0"]}) -> (tensor<1x2xf32> {tf_saved_model.index_path = ["output_tensor:0"]}) attributes {tf.entry_function = {inputs = "input_tensor:0", outputs = "output_tensor:0"}, tf_saved_model.exported_names = ["main"]} { + %0 = tf_executor.graph { + tf_executor.fetch %arg : tensor<1x2xf32> + } + return %0 : tensor<1x2xf32> + } + } + )mlir"); + ASSERT_TRUE(module_op); + + const absl::StatusOr exported_model = + ConvertMlirModuleToExportedModel(*module_op, /*checkpoint_dir=*/"", + /*function_aliases=*/{}, + /*asset_file_defs=*/{}); + + ASSERT_THAT(exported_model, IsOk()); + // There are 2 nodes in the graph, one for arg and another for retval. + ASSERT_THAT(exported_model->graph_def().node(), SizeIs(2)); + + // Match the `_Arg` node that corresponds to the argument of @main. + const auto arg_node_itr = + llvm::find_if(exported_model->graph_def().node(), + [](const NodeDef& node) { return node.op() == "_Arg"; }); + ASSERT_NE(arg_node_itr, exported_model->graph_def().node().end()); + EXPECT_THAT(arg_node_itr->name(), StrEq("input_tensor")); + ASSERT_TRUE(arg_node_itr->attr().contains("tf_saved_model.index_path")); + ASSERT_THAT(arg_node_itr->attr().at("tf_saved_model.index_path").list().s(), + SizeIs(1)); + EXPECT_THAT( + arg_node_itr->attr().at("tf_saved_model.index_path").list().s()[0], + StrEq("input_tensor:0")); + + // Match the `_Retval` node that corresponds to the return value of @main. + const auto retval_node_itr = + llvm::find_if(exported_model->graph_def().node(), + [](const NodeDef& node) { return node.op() == "_Retval"; }); + ASSERT_NE(retval_node_itr, exported_model->graph_def().node().end()); + EXPECT_THAT(retval_node_itr->name(), StrEq("output_tensor")); + ASSERT_TRUE(retval_node_itr->attr().contains("tf_saved_model.index_path")); + ASSERT_THAT( + retval_node_itr->attr().at("tf_saved_model.index_path").list().s(), + SizeIs(1)); + EXPECT_THAT( + retval_node_itr->attr().at("tf_saved_model.index_path").list().s()[0], + StrEq("output_tensor:0")); +} + +TEST_F(ConvertMlirModuleToExportedModelTest, CheckpointDirSet) { + // Define a module a no-op main function. + mlir::OwningOpRef module_op = ParseModuleOpString(R"mlir( + module attributes {tf_saved_model.semantics} { + func.func @main() -> () attributes {tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } + } + )mlir"); + ASSERT_TRUE(module_op); + + const absl::StatusOr exported_model = + ConvertMlirModuleToExportedModel(*module_op, "my_checkpoint_dir", + /*function_aliases=*/{}, + /*asset_file_defs=*/{}); + + ASSERT_THAT(exported_model, IsOk()); + EXPECT_THAT(exported_model->checkpoint_dir(), StrEq("my_checkpoint_dir")); +} + +TEST_F(ConvertMlirModuleToExportedModelTest, FunctionAliasesSet) { + // Define a module with 2 function calls, function_1 and function_2. 
+ mlir::OwningOpRef module_op = ParseModuleOpString(R"mlir( + module attributes {tf_saved_model.semantics} { + func.func private @function_1() -> () attributes {tf._original_func_name = "__func_1"} { + tf_executor.graph { + %control_0 = tf_executor.island wraps "tf.NoOp"() : () -> () + } + return + } + + func.func private @function_2() -> () attributes {tf._original_func_name = "__func_2"} { + tf_executor.graph { + %control_0 = tf_executor.island wraps "tf.NoOp"() : () -> () + } + return + } + + func.func @main() -> () attributes {tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + %control_0 = tf_executor.island wraps "tf.PartitionedCall"() <{config = "", config_proto = "", executor_type = "", f = @function_1}> : () -> () + %control_1 = tf_executor.island wraps "tf.PartitionedCall"() <{config = "", config_proto = "", executor_type = "", f = @function_2}> : () -> () + tf_executor.fetch %control_0, %control_1 : !tf_executor.control, !tf_executor.control + } + return + } + } + )mlir"); + ASSERT_TRUE(module_op); + + const absl::StatusOr exported_model = + ConvertMlirModuleToExportedModel( + *module_op, /*checkpoint_dir=*/"", + /*function_aliases=*/ + {{"alias_1", "function_1"}, {"alias_2", "function_2"}}, + /*asset_file_defs=*/{}); + + ASSERT_THAT(exported_model, IsOk()); + ASSERT_THAT(exported_model->function_aliases(), SizeIs(2)); + EXPECT_THAT(exported_model->function_aliases().at("alias_1"), + StrEq("function_1")); + EXPECT_THAT(exported_model->function_aliases().at("alias_2"), + StrEq("function_2")); +} + +TEST_F(ConvertMlirModuleToExportedModelTest, AssetFileDefSet) { + // Define a module a no-op main function. + mlir::OwningOpRef module_op = ParseModuleOpString(R"mlir( + module attributes {tf_saved_model.semantics} { + func.func @main() -> () attributes {tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } + } + )mlir"); + ASSERT_TRUE(module_op); + + AssetFileDef asset_file_def{}; + ASSERT_TRUE( + TextFormat::ParseFromString(R"pb(filename: "vocab_file.txt", + tensor_info { name: "arg_0:0" })pb", + &asset_file_def)); + const std::vector asset_file_defs = {asset_file_def}; + + const absl::StatusOr exported_model = + ConvertMlirModuleToExportedModel(*module_op, /*checkpoint_dir=*/"", + /*function_aliases=*/{}, + /*asset_file_defs=*/asset_file_defs); + + ASSERT_THAT(exported_model, IsOk()); + ASSERT_THAT(exported_model->asset_file_defs(), SizeIs(1)); + EXPECT_THAT(exported_model->asset_file_defs()[0].filename(), + StrEq("vocab_file.txt")); + EXPECT_THAT(exported_model->asset_file_defs()[0].tensor_info().name(), + StrEq("arg_0:0")); +} + +TEST_F(ConvertMlirModuleToExportedModelTest, + InitNodeNameSetToLocOfControlOutput) { + // Define a module that initializes a tf.HashTableV2 whose control output node + // for the initialization is named "init_op_init_all_tables". 
+ mlir::OwningOpRef module_op = ParseModuleOpString(R"mlir( + module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() <{initializers = []}> : () -> () + "tf_saved_model.asset"() <{filename = "assets/vocab_file.txt", sym_name = "__tf_saved_model_asset0_vocab_file.txt"}> : () -> () + func.func @main(%arg1: tensor {tf_saved_model.index_path = ["arg_0:0"]}) -> (tensor<1x2xf32> {tf_saved_model.index_path = ["output:0"]}) attributes {tf.entry_function = {inputs = "arg_0:0", outputs = "output:0"}, tf_saved_model.exported_names = ["main"]} { + %0 = tf_executor.graph { + %o_0, %c_0 = tf_executor.island wraps "tf.Const"() <{value = dense<1.0> : tensor<1x2xf32>}> : () -> tensor<1x2xf32> + %o, %c = tf_executor.island wraps "tf.HashTableV2"() <{container = "", key_dtype = !tf_type.string, shared_name = "vocab_file.txt", use_node_name_sharing = false, value_dtype = i64}> {device = ""} : () -> tensor + %c_9 = tf_executor.island wraps "tf.InitializeTableFromTextFileV2"(%o, %arg1) <{delimiter = "\09", key_index = -2 : i64, value_index = -1 : i64, vocab_size = -1 : i64}> {_has_manual_control_dependencies = true, device = ""} : (tensor, tensor) -> () + // Location of this control output op becomes the name of the init_op. + %c_10 = tf_executor.island(%c_9) wraps "tf.NoOp"() : () -> () loc("init_op_init_all_tables") + tf_executor.fetch %o_0, %c_10 : tensor<1x2xf32>, !tf_executor.control + } + return %0 : tensor<1x2xf32> + } + } + )mlir"); + ASSERT_TRUE(module_op); + + const absl::StatusOr exported_model = + ConvertMlirModuleToExportedModel(*module_op, /*checkpoint_dir=*/"", + /*function_aliases=*/{}, + /*asset_file_defs=*/{}); + + ASSERT_THAT(exported_model, IsOk()); + EXPECT_THAT(exported_model->init_node_name(), + StrEq("init_op_init_all_tables")); + + // Match the init node, which is a NoOp that has control dependency to + // HashTableV2 initialization. Fetching this node in TF Session will + // initialize the hash table. + const auto init_node_itr = llvm::find_if( + exported_model->graph_def().node(), [](const NodeDef& node) { + return node.name() == "init_op_init_all_tables"; + }); + ASSERT_NE(init_node_itr, exported_model->graph_def().node().end()); + EXPECT_THAT(init_node_itr->op(), StrEq("NoOp")); + ASSERT_THAT(init_node_itr->input(), SizeIs(1)); + // "^" means control input. + EXPECT_THAT(init_node_itr->input()[0], + StrEq("^tf.InitializeTableFromTextFileV2")); +} + +TEST_F(ConvertMlirModuleToExportedModelTest, InitNodeNotSetIfLocNameMismatch) { + // Define a module that initializes a tf.HashTableV2 whose control output node + // for the initialization is named "init_ok". Since the output control node + // name does not begin with "init_op" the init node could not have been found + // after the conversion. 
+ mlir::OwningOpRef module_op = ParseModuleOpString(R"mlir( + module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() <{initializers = []}> : () -> () + "tf_saved_model.asset"() <{filename = "assets/vocab_file.txt", sym_name = "__tf_saved_model_asset0_vocab_file.txt"}> : () -> () + func.func @main(%arg1: tensor {tf_saved_model.index_path = ["arg_0:0"]}) -> (tensor<1x2xf32> {tf_saved_model.index_path = ["output:0"]}) attributes {tf.entry_function = {inputs = "arg_0:0", outputs = "output:0"}, tf_saved_model.exported_names = ["main"]} { + %0 = tf_executor.graph { + %output_0, %control_0 = tf_executor.island wraps "tf.Const"() <{value = dense<1.0> : tensor<1x2xf32>}> : () -> tensor<1x2xf32> + %output_1, %control_1 = tf_executor.island wraps "tf.HashTableV2"() <{container = "", key_dtype = !tf_type.string, shared_name = "vocab_file.txt", use_node_name_sharing = false, value_dtype = i64}> {device = ""} : () -> tensor + %control_2 = tf_executor.island wraps "tf.InitializeTableFromTextFileV2"(%output_1, %arg1) <{delimiter = "\09", key_index = -2 : i64, value_index = -1 : i64, vocab_size = -1 : i64}> {_has_manual_control_dependencies = true, device = ""} : (tensor, tensor) -> () + // Location of this control output op becomes the name of the init_op. + %control_3 = tf_executor.island(%control_2) wraps "tf.NoOp"() : () -> () loc("init_ok") + tf_executor.fetch %output_0, %control_3 : tensor<1x2xf32>, !tf_executor.control + } + return %0 : tensor<1x2xf32> + } + } + )mlir"); + ASSERT_TRUE(module_op); + + const absl::StatusOr exported_model = + ConvertMlirModuleToExportedModel(*module_op, /*checkpoint_dir=*/"", + /*function_aliases=*/{}, + /*asset_file_defs=*/{}); + + ASSERT_THAT(exported_model, IsOk()); + EXPECT_THAT(exported_model->init_node_name(), IsEmpty()); +} + +TEST_F(ConvertMlirModuleToExportedModelTest, + ConversionFailureWhenNoMainFunction) { + // Define a module a function whose name is not @main. + mlir::OwningOpRef module_op = ParseModuleOpString(R"mlir( + module attributes {tf_saved_model.semantics} { + func.func @not_main() -> () attributes {tf_saved_model.exported_names = ["not_main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } + } + )mlir"); + ASSERT_TRUE(module_op); + + const absl::StatusOr exported_model = + ConvertMlirModuleToExportedModel(*module_op, "my_checkpoint_dir", + /*function_aliases=*/{}, + /*asset_file_defs=*/{}); + EXPECT_THAT(exported_model, + StatusIs(absl::StatusCode::kFailedPrecondition, + HasSubstr("entry function `main` must be present"))); +} + +} // namespace +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_import.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_import.cc new file mode 100644 index 000000000000..5f414a39c607 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_import.cc @@ -0,0 +1,153 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_import.h" + +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/base/attributes.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/cc/saved_model/reader.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_preprocess.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" +#include "xla/tsl/platform/errors.h" +#include "xla/tsl/platform/statusor.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace mlir::tf_quant::stablehlo { + +using ::stablehlo::quantization::QuantizationConfig; +using ::tensorflow::MLIRImportOptions; +using ::tensorflow::SavedModelBundle; +using ::tensorflow::SavedModelSignatureDefsToMlirImport; +using ::tensorflow::quantization::PreprocessAndFreezeGraph; + +absl::StatusOr SavedModelToMlirModuleOp( + const absl::string_view saved_model_path, + const std::unordered_set& tags, + const std::vector& signature_keys, + MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND) { + MLIRImportOptions import_options; + import_options.upgrade_legacy = true; + import_options.lift_variables = false; + import_options.include_variables_in_initializers = true; + + auto bundle = std::make_unique(); + + // Copy to eliminate the `const` qualifier so that `absl::MakeSpan` can be + // called on it. 
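+  // (`signature_keys` is a `const` reference; calling `absl::MakeSpan` on it
+  // directly would yield an `absl::Span<const std::string>`, while the
+  // importer below expects a mutable `absl::Span<std::string>`.)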
+ std::vector exported_names = signature_keys; + absl::StatusOr> module_op = + SavedModelSignatureDefsToMlirImport(saved_model_path, tags, + absl::MakeSpan(exported_names), &ctx, + import_options, &bundle); + if (!module_op.status().ok()) { + return absl::InternalError(absl::StrCat("Failed to import SavedModel: ", + module_op.status().ToString())); + } + + return std::make_pair(std::move(*module_op), std::move(bundle)); +} + +absl::StatusOr> +GetFunctionAliases(absl::string_view saved_model_path, + const std::unordered_set& tags) { + tensorflow::MetaGraphDef meta_graph; + TF_RETURN_IF_ERROR(tensorflow::ReadMetaGraphDefFromSavedModel( + saved_model_path, tags, &meta_graph)); + + absl::flat_hash_map function_aliases( + meta_graph.meta_info_def().function_aliases().begin(), + meta_graph.meta_info_def().function_aliases().end()); + return function_aliases; +} + +void UpdateFunctionAliases( + absl::flat_hash_map& function_aliases, + ModuleOp module_op) { + absl::flat_hash_set existing_func_names; + module_op->walk([&](func::FuncOp func_op) { + FunctionName func_name = func_op.getSymName().str(); + existing_func_names.insert(func_name); + // We may retrieve the original function's name from the attribute. + // Functions without this attribute are ignored. + auto original_func_name = + func_op->getAttrOfType("tf._original_func_name"); + if (original_func_name) { + if (auto alias_itr = function_aliases.find(original_func_name.str()); + alias_itr != function_aliases.end()) { + const FunctionAlias alias = alias_itr->second; + function_aliases[func_name] = alias; + } + } + }); + + // Remove aliases to function that no-longer exists. + absl::erase_if(function_aliases, [&existing_func_names](const auto& item) { + return !existing_func_names.contains(item.first); + }); +} + +absl::StatusOr> ImportSavedModel( + const absl::string_view saved_model_path, + const std::vector& signature_keys, + const std::unordered_set& tags, + const QuantizationConfig& quantization_config, + const absl::string_view mlir_dump_file_prefix, + absl::flat_hash_map& function_aliases, + MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND) { + TF_ASSIGN_OR_RETURN( + ImportedMlirModuleOp imported_module, + SavedModelToMlirModuleOp(saved_model_path, tags, signature_keys, ctx)); + auto [module_op, saved_model_bundle] = std::move(imported_module); + + UpdateFunctionAliases(function_aliases, *module_op); + + // Collect the names of the functions that have aliases so that they may not + // be inlined. + absl::flat_hash_set aliased_function_names; + absl::c_for_each(function_aliases, [&](const auto& aliases) { + return aliased_function_names.insert(aliases.first); + }); + + TF_RETURN_IF_ERROR(PreprocessAndFreezeGraph( + mlir_dump_file_prefix, /*is_inliner_run=*/true, + /*noinline_functions=*/aliased_function_names, *module_op, &ctx, + saved_model_bundle == nullptr ? nullptr + : saved_model_bundle->GetSession(), + /*run_tf_to_stablehlo=*/true, /*deserialize_xla_call_module=*/false)); + return std::move(module_op); +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_import.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_import.h new file mode 100644 index 000000000000..4ecef73ecbbd --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_import.h @@ -0,0 +1,92 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Functionalities for importing MLIR ModuleOp from TensorFlow SavedModel. + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_SAVED_MODEL_IMPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_SAVED_MODEL_IMPORT_H_ + +#include +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/container/flat_hash_map.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + +namespace mlir::tf_quant::stablehlo { + +// Represents a pair of `mlir::ModuleOp` and `tensorflow::SavedModelBundle`. The +// SavedModelBundle complements the imported ModuleOp by providing access to +// `tensorflow::Session` which may be useful when reading values from resources +// (e.g. `TF::VarHandleOp`s). +using ImportedMlirModuleOp = + std::pair, + std::unique_ptr<::tensorflow::SavedModelBundle>>; +using quant::stablehlo::FunctionAlias; +using quant::stablehlo::FunctionName; + +// Loads a SavedModel at `saved_model_path` and converts it to `mlir::ModuleOp`. +// +// `tags` identify the `tensorflow::MetaGraphDef` to load from the SavedModel. +// Similarly, `signature_keys` identify the functions (`SignatureDef`s) to load +// within the `MetaGraphDef`. `ctx` is the `MLIRContext`, which should outlive +// the returned `ModuleOp`, thus marked with the lifetime bound attribute. +// TODO: b/329206105 - Add unit tests after decomposing preprocessing passes. +absl::StatusOr SavedModelToMlirModuleOp( + absl::string_view saved_model_path, + const std::unordered_set& tags, + const std::vector& signature_keys, + MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND); + +// Gets the function aliases from the SavedModel. +absl::StatusOr> +GetFunctionAliases(absl::string_view saved_model_path, + const std::unordered_set& tags); + +// Updates the function aliases. `module_op` may have different +// function names from the original model, so it re-associates the aliases +// with the new function names. Both the input `function_aliases` and the +// returned value are function name -> alias mappings. `function_aliases` is +// the function alias mapping of the original function. The original function's +// name is retrieved by looking at the "tf._original_func_name" string attribute +// attached to a `func::FuncOp`. +void UpdateFunctionAliases( + absl::flat_hash_map& function_aliases, + ModuleOp module_op); + +// Loads a SavedModel to `mlir::ModuleOp` and performs preprocesses including +// shape inference and graph freezing. 
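+//
+// For illustration only (the path, tags, and signature keys are hypothetical;
+// `config` and `ctx` are assumed to exist in the caller):
+//
+//   absl::StatusOr<absl::flat_hash_map<FunctionName, FunctionAlias>> aliases =
+//       GetFunctionAliases("/tmp/saved_model", /*tags=*/{"serve"});
+//   if (!aliases.ok()) return aliases.status();
+//   TF_ASSIGN_OR_RETURN(OwningOpRef<ModuleOp> module_op,
+//                       ImportSavedModel("/tmp/saved_model",
+//                                        /*signature_keys=*/{"serving_default"},
+//                                        /*tags=*/{"serve"}, config,
+//                                        /*mlir_dump_file_prefix=*/"quant_debug",
+//                                        *aliases, ctx));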
+// TODO: b/329206105 - Add unit tests after decomposing preprocessing passes. +absl::StatusOr> ImportSavedModel( + absl::string_view saved_model_path, + const std::vector& signature_keys, + const std::unordered_set& tags, + const ::stablehlo::quantization::QuantizationConfig& quantization_config, + absl::string_view mlir_dump_file_prefix, + absl::flat_hash_map& function_aliases, + MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND); + +} // namespace mlir::tf_quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_SAVED_MODEL_IMPORT_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_import_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_import_test.cc new file mode 100644 index 000000000000..61299229cf5e --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_import_test.cc @@ -0,0 +1,120 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_import.h" + +#include +#include +#include "absl/container/flat_hash_map.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_test_base.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" + +namespace mlir::tf_quant::stablehlo { +namespace { + +using ::testing::IsEmpty; +using ::testing::Pair; +using ::testing::UnorderedElementsAre; + +using UpdateFunctionAliasesTest = ::mlir::tf_quant::QuantizationTestBase; + +TEST_F(UpdateFunctionAliasesTest, NoAliasesReturnsEmptyMap) { + // MLIR @main function corresponds to the TF function "main_original". + OwningOpRef module_op = ParseModuleOpString(R"mlir( + func.func private @main(%arg: tensor<1x2xf32>) -> (tensor<1x2xf32>) attributes {tf._original_func_name = "main_original"} { + return %arg : tensor<1x2xf32> + } + )mlir"); + ASSERT_TRUE(module_op); + + absl::flat_hash_map function_aliases; + UpdateFunctionAliases(function_aliases, *module_op); + EXPECT_THAT(function_aliases, IsEmpty()); +} + +TEST_F(UpdateFunctionAliasesTest, AliasUpdatedByMlirFunctionName) { + // MLIR @main function corresponds to the TF function "main_original". + OwningOpRef module_op = ParseModuleOpString(R"mlir( + func.func private @main(%arg: tensor<1x2xf32>) -> (tensor<1x2xf32>) attributes {tf._original_func_name = "main_original"} { + return %arg : tensor<1x2xf32> + } + )mlir"); + ASSERT_TRUE(module_op); + + absl::flat_hash_map function_aliases{ + {"main_original", "main_alias"}}; + UpdateFunctionAliases(function_aliases, *module_op); + + EXPECT_THAT(function_aliases, + UnorderedElementsAre(Pair("main", "main_alias"))); +} + +TEST_F(UpdateFunctionAliasesTest, IgnoresUnmatchedFunctions) { + // MLIR @main function corresponds to the TF function "main_original". 
+ OwningOpRef module_op = ParseModuleOpString(R"mlir( + func.func private @main(%arg: tensor<1x2xf32>) -> (tensor<1x2xf32>) attributes {tf._original_func_name = "main_original"} { + return %arg : tensor<1x2xf32> + } + )mlir"); + ASSERT_TRUE(module_op); + + // There is no alias corresponding to "main_original". The existing entry + // without a corresponding function is ignored. + absl::flat_hash_map function_aliases{ + {"not_main", "not_main_alias"}}; + UpdateFunctionAliases(function_aliases, *module_op); + + EXPECT_THAT(function_aliases, IsEmpty()); +} + +TEST_F(UpdateFunctionAliasesTest, + SkipsFunctionsWithNoOriginalFuncNameAttribute) { + // @main does not have the "tf._original_func_name" attribute. + OwningOpRef module_op = ParseModuleOpString(R"mlir( + func.func private @main(%arg: tensor<1x2xf32>) -> (tensor<1x2xf32>) { + return %arg : tensor<1x2xf32> + } + )mlir"); + ASSERT_TRUE(module_op); + + // The existing entry without a corresponding function is ignored. + absl::flat_hash_map function_aliases{ + {"main_original", "main_alias"}}; + UpdateFunctionAliases(function_aliases, *module_op); + + EXPECT_THAT(function_aliases, IsEmpty()); +} + +TEST_F(UpdateFunctionAliasesTest, FunctionNameNotChanged) { + // @main does not have the "tf._original_func_name" attribute. + OwningOpRef module_op = ParseModuleOpString(R"mlir( + func.func private @main_original(%arg: tensor<1x2xf32>) -> (tensor<1x2xf32>) { + return %arg : tensor<1x2xf32> + } + )mlir"); + ASSERT_TRUE(module_op); + + // The existing entry without a corresponding function is ignored. + absl::flat_hash_map function_aliases{ + {"main_original", "main_alias"}}; + UpdateFunctionAliases(function_aliases, *module_op); + + EXPECT_THAT(function_aliases, + UnorderedElementsAre(Pair("main_original", "main_alias"))); +} + +} // namespace +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_weight_only_ptq.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_weight_only_ptq.cc new file mode 100644 index 000000000000..f7242dc43128 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_weight_only_ptq.cc @@ -0,0 +1,125 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_weight_only_ptq.h" + +#include +#include +#include +#include + +#include "absl/base/nullability.h" +#include "absl/container/flat_hash_map.h" +#include "absl/log/die_if_null.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pass_pipeline.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_export.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_saved_model_import.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/tf_save_report.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "xla/mlir_hlo/mhlo/transforms/passes.h" +#include "xla/tsl/platform/errors.h" +#include "xla/tsl/platform/statusor.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace mlir::tf_quant::stablehlo { + +using ::stablehlo::quantization::GetReportFilePath; +using ::stablehlo::quantization::QuantizationConfig; +using ::tensorflow::SignatureDef; +using ::tensorflow::quantization::ExportedModel; +using ::tensorflow::quantization::PyFunctionLibrary; +using ::tensorflow::quantization::RunPasses; + +WeightOnlyPtqComponent::WeightOnlyPtqComponent(MLIRContext* absl_nonnull ctx) + : ctx_(ABSL_DIE_IF_NULL(ctx)) {} // Crash OK + +absl::StatusOr WeightOnlyPtqComponent::Run( + ModuleOp module_op, const QuantizationConfig& config) { + TF_RETURN_IF_ERROR(RunPasses( + kName, /*add_passes_func=*/ + [&config](PassManager& pm) { + // Add instrumentation to save quantization report after quantization. 
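+        // (The report is written only after `QuantizeCompositeFunctionPass`
+        // has run on the module, and only when `GetReportFilePath(config)`
+        // yields a file path; see the instrumentation defined in
+        // tf_save_report.cc.)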
+ pm.addInstrumentation( + std::make_unique( + GetReportFilePath(config))); + + AddWeightOnlyQuantizationPasses(pm, config.specs(), + config.pipeline_config(), + config.debugger_config()); + }, + *ctx_, module_op)); + return module_op; +} + +absl::Status QuantizeWeightOnlyPtq( + const absl::string_view src_saved_model_path, + const absl::string_view dst_saved_model_path, + QuantizationConfig quantization_config, + const std::vector& signature_keys, + const absl::flat_hash_map& signature_def_map, + const PyFunctionLibrary& py_function_library) { + std::unordered_set tags; + tags.insert(quantization_config.tf_saved_model().tags().begin(), + quantization_config.tf_saved_model().tags().end()); + + std::unique_ptr ctx = + quant::stablehlo::CreateMlirContextForQuantization(); + + absl::StatusOr> + function_aliases = GetFunctionAliases(src_saved_model_path, tags); + if (!function_aliases.ok()) { + return absl::InternalError(absl::StrCat( + "Failed to get function alias: ", function_aliases.status().message())); + } + + TF_ASSIGN_OR_RETURN( + auto module, + ImportSavedModel(src_saved_model_path, signature_keys, tags, + quantization_config, WeightOnlyPtqComponent::kName, + *function_aliases, *ctx)); + + WeightOnlyPtqComponent weight_only_ptq_component(ctx.get()); + TF_ASSIGN_OR_RETURN( + *module, weight_only_ptq_component.Run(*module, quantization_config)); + + TF_ASSIGN_OR_RETURN( + const ExportedModel post_calibrated_exported_model, + CreateExportedModel(signature_keys, tags, quantization_config, + WeightOnlyPtqComponent::kName, *function_aliases, + *ctx, *module)); + + // Remove the `tpu` tag for exporting because the output quantized model is + // essentially a CPU model. + tags.erase("tpu"); + + py_function_library.SaveExportedModel( + dst_saved_model_path, post_calibrated_exported_model, + src_saved_model_path, tags, signature_def_map); + + return absl::OkStatus(); +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_weight_only_ptq.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_weight_only_ptq.h new file mode 100644 index 000000000000..403a89b768f3 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_weight_only_ptq.h @@ -0,0 +1,80 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_WEIGHT_ONLY_PTQ_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_WEIGHT_ONLY_PTQ_H_ + +#include +#include + +#include "absl/base/nullability.h" +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace mlir::tf_quant::stablehlo { + +// Performs int8 weight-only quantization on dot_general ops. +// +// The resulting `ModuleOp` contains quantized StableHLO ops serialized in +// `TF::XlaCallModuleOp`s. They are quantized using the weight constants, not +// relying on calibration. +class WeightOnlyPtqComponent : public quant::stablehlo::Component { + public: + // Used for debugging purposes. + static constexpr absl::string_view kName = "quant_ptq_weight_only"; + + explicit WeightOnlyPtqComponent(MLIRContext* absl_nonnull ctx); + + absl::StatusOr Run( + ModuleOp module_op, + const ::stablehlo::quantization::QuantizationConfig& config) override; + + private: + MLIRContext* absl_nonnull ctx_; +}; + +// Runs weight-only quantization on a SavedModel at +// `src_saved_model_path` and saves the resulting model to +// `dst_saved_model_path`. +// +// `quantization_config` configures the quantization behavior for the +// weight-only quantization. +// +// `signature_keys` specify the signatures that correspond to functions to be +// quantized. `signature_def_map` connects the signature keys to +// `SignatureDef`s. +// +// Returns a non-OK status when the quantization is not successful. 
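+//
+// For illustration only (paths and values are hypothetical; the
+// `PyFunctionLibrary` instance and `signature_def_map` are normally provided
+// by the Python layer):
+//
+//   ::stablehlo::quantization::QuantizationConfig config;
+//   config.mutable_tf_saved_model()->add_tags("serve");
+//   const absl::Status status = QuantizeWeightOnlyPtq(
+//       /*src_saved_model_path=*/"/tmp/src_model",
+//       /*dst_saved_model_path=*/"/tmp/dst_model", config,
+//       /*signature_keys=*/{"serving_default"}, signature_def_map,
+//       py_function_library);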
+// LINT.IfChange
+absl::Status QuantizeWeightOnlyPtq(
+    absl::string_view src_saved_model_path,
+    absl::string_view dst_saved_model_path,
+    ::stablehlo::quantization::QuantizationConfig quantization_config,
+    const std::vector<std::string>& signature_keys,
+    const absl::flat_hash_map<std::string, tensorflow::SignatureDef>&
+        signature_def_map,
+    const tensorflow::quantization::PyFunctionLibrary& py_function_library);
+// LINT.ThenChange(../python/pywrap_quantization.cc:weight_only_ptq)
+
+}  // namespace mlir::tf_quant::stablehlo
+
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_TF_WEIGHT_ONLY_PTQ_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.cc
index 3f8215edc605..ec780bf8cf9a 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.cc
@@ -53,7 +53,7 @@ using ::tensorflow::quantization::ExportedModel;
 using ::tensorflow::quantization::PyFunctionLibrary;
 using ::tensorflow::quantization::RunPasses;
 
-WeightOnlyPtqComponent::WeightOnlyPtqComponent(absl::Nonnull<MLIRContext*> ctx)
+WeightOnlyPtqComponent::WeightOnlyPtqComponent(MLIRContext* absl_nonnull ctx)
     : ctx_(ABSL_DIE_IF_NULL(ctx)) {}  // Crash OK
 
 absl::StatusOr<ModuleOp> WeightOnlyPtqComponent::Run(
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h
index bf23e93246c7..ba18d729042d 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h
@@ -42,14 +42,14 @@ class WeightOnlyPtqComponent : public Component {
   // Used for debugging purposes.
   static constexpr absl::string_view kName = "quant_ptq_weight_only";
 
-  explicit WeightOnlyPtqComponent(absl::Nonnull<MLIRContext*> ctx);
+  explicit WeightOnlyPtqComponent(MLIRContext* absl_nonnull ctx);
 
   absl::StatusOr<ModuleOp> Run(
       ModuleOp module_op,
       const ::stablehlo::quantization::QuantizationConfig& config) override;
 
  private:
-  absl::Nonnull<MLIRContext*> ctx_;
+  MLIRContext* absl_nonnull ctx_;
 };
 
 // Runs weight-only quantization on a SavedModel at
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/BUILD
index f2016bc16446..005014f19cd6 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/BUILD
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/BUILD
@@ -9,6 +9,43 @@ package(
     licenses = ["notice"],
 )
 
+cc_library(
+    name = "tf_save_report",
+    srcs = ["tf_save_report.cc"],
+    hdrs = ["tf_save_report.h"],
+    compatible_with = get_compatible_with_portable(),
+    deps = [
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:tf_report",
+        "@com_google_absl//absl/base:nullability",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/strings:string_view",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
+    ],
+)
+
+tf_cc_test(
+    name = "tf_save_report_test",
+    srcs = ["tf_save_report_test.cc"],
+    deps = [
+        ":tf_save_report",
+        "//tensorflow/compiler/mlir/quantization/common:tf_test_base",
+        "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
+        "//tensorflow/compiler/mlir/quantization/stablehlo:tf_passes",
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:io",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+
"@com_google_googletest//:gtest_main", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:protobuf", + "@local_xla//xla/tsl/platform:status_matchers", + ], +) + cc_library( name = "save_report", srcs = ["save_report.cc"], diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.cc b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.cc index e1a705cdbb24..edba8f604086 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.cc @@ -38,8 +38,8 @@ std::optional OptionalStringViewToOptionalString( } // Whether the pass is `QuantizeCompositeFunctionPass`. -bool IsQuantizeCompositeFunctionPass(absl::Nullable pass, - absl::Nullable op) { +bool IsQuantizeCompositeFunctionPass(Pass* absl_nullable pass, + Operation* absl_nullable op) { // It is known that `op` is `ModuleOp` when `pass` is // `QuantizeCompositeFunctionPass`, but the check is still performed to be // defensive. @@ -52,7 +52,7 @@ bool IsQuantizeCompositeFunctionPass(absl::Nullable pass, // * After running `QuantizeCompositeFunctionPass`. // * The pass is run on `ModuleOp`. // * `file_path` is not `nullopt`. -bool ShouldSaveReport(absl::Nullable pass, absl::Nullable op, +bool ShouldSaveReport(Pass* absl_nullable pass, Operation* absl_nullable op, const std::optional& file_path) { return file_path != std::nullopt && IsQuantizeCompositeFunctionPass(pass, op); } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/tf_save_report.cc b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/tf_save_report.cc new file mode 100644 index 000000000000..70b309f5b83d --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/tf_save_report.cc @@ -0,0 +1,95 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/tf_save_report.h" + +#include +#include + +#include "absl/base/nullability.h" +#include "absl/log/log.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_report.h" + +namespace mlir::tf_quant::stablehlo { +namespace { + +// Converts `std::optional` to `std::optional`. +// A `std::nullopt` is returned when `view` is `std::nullopt`. +std::optional OptionalStringViewToOptionalString( + std::optional view) { + if (view == std::nullopt) return std::nullopt; + return std::make_optional(*view); +} + +// Whether the pass is `QuantizeCompositeFunctionPass`. 
+bool IsQuantizeCompositeFunctionPass(Pass* absl_nullable pass, + Operation* absl_nullable op) { + // It is known that `op` is `ModuleOp` when `pass` is + // `QuantizeCompositeFunctionPass`, but the check is still performed to be + // defensive. + return pass != nullptr && + pass->getArgument() == "tf-stablehlo-quantize-composite-functions" && + isa_and_nonnull(op); +} + +// Report is saved only when: +// * After running `QuantizeCompositeFunctionPass`. +// * The pass is run on `ModuleOp`. +// * `file_path` is not `nullopt`. +bool ShouldSaveReport(Pass* absl_nullable pass, Operation* absl_nullable op, + const std::optional& file_path) { + return file_path != std::nullopt && IsQuantizeCompositeFunctionPass(pass, op); +} + +void SaveReport(const QuantizationReport& report, + const absl::string_view file_path) { + if (const absl::Status save_status = report.Save(file_path); + save_status.ok()) { + LOG(INFO) << "Successfully saved quantization report to: " << file_path; + } else { + LOG(ERROR) << "Failed to save quantization report to: " << file_path + << " with status: " << save_status; + } +} + +} // namespace + +SaveQuantizationReportInstrumentation::SaveQuantizationReportInstrumentation( + std::optional file_path) + : file_path_(OptionalStringViewToOptionalString(file_path)) {} + +void SaveQuantizationReportInstrumentation::runAfterPass(Pass* pass, + Operation* op) { + // Only run after `QuantizeCompositeFunctionPass`. + if (!IsQuantizeCompositeFunctionPass(pass, op)) return; + + auto module_op = cast(op); + const QuantizationReport report(module_op); + + // Print a human-readable report to stdout regardless of whether the report + // is saved to file. + report.Print(); + + // Exit early if the report should not be saved to file. + if (!ShouldSaveReport(pass, op, file_path_)) return; + + SaveReport(report, *file_path_); +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/tf_save_report.h b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/tf_save_report.h new file mode 100644 index 000000000000..827ffde4ff3a --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/tf_save_report.h @@ -0,0 +1,52 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_INSTRUMENTATIONS_TF_SAVE_REPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_INSTRUMENTATIONS_TF_SAVE_REPORT_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassInstrumentation.h" // from @llvm-project + +namespace mlir::tf_quant::stablehlo { + +// A `PassInstrumentation` that saves quantization report to file after +// `QuantizeCompositeFunctionsPass` is run. 
It inspects the `ModuleOp` after +// quantization and analyzes the quantizable units and quantization methods +// used. The report file will be saved at the `file_path`. The report file +// contains textproto of `QuantizationResults`. `file_path`'s base directories +// should exist (this pass instrumentation will not `mkdir` them). +// +// See `QuantizationReport` for further details on the quantization report. +class SaveQuantizationReportInstrumentation : public PassInstrumentation { + public: + // `file_path` is the path to save the report file. The report file is in + // textproto format so a `.txtpb` extension is preferred but it doesn't result + // in error if other extension is used. This instrumentation will not be run + // if `file_path` is a `nullopt`. + explicit SaveQuantizationReportInstrumentation( + std::optional file_path); + + void runAfterPass(Pass* pass, Operation* op) override; + + private: + std::optional file_path_; // Path to file to save the report. +}; + +} // namespace mlir::tf_quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_INSTRUMENTATIONS_TF_SAVE_REPORT_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/tf_save_report_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/tf_save_report_test.cc new file mode 100644 index 000000000000..8cf1a3de20a6 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/tf_save_report_test.cc @@ -0,0 +1,187 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/tf_save_report.h" + +#include +#include +#include + +#include +#include +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_test_base.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "xla/tsl/platform/status_matchers.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep + +namespace mlir::tf_quant::stablehlo { +namespace { + +using ::stablehlo::quantization::QuantizationResults; +using ::stablehlo::quantization::io::ReadFileToString; +using ::testing::SizeIs; +using ::testing::StrEq; +using ::tsl::protobuf::TextFormat; +using ::tsl::testing::IsOk; +using ::tsl::testing::StatusIs; + +using SaveQuantizationReportInstrumentationTest = QuantizationTestBase; + +TEST_F(SaveQuantizationReportInstrumentationTest, SaveReport) { + constexpr absl::string_view kModuleWithCompositeDotGeneral = R"mlir( + func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } + + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } + )mlir"; + + const OwningOpRef module_op = + ParseModuleOpString(kModuleWithCompositeDotGeneral); + ASSERT_TRUE(module_op); + + // Create a pass manager with `SaveQuantizationReportInstrumentation` and + // `QuantizeCompositeFunctionsPass`. Run the passes against `module_op`. 
+ PassManager pm(ctx_.get()); + + QuantizeCompositeFunctionsPassOptions options; + pm.addPass(createQuantizeCompositeFunctionsPass(options)); + + const std::string report_file_path = + absl::StrCat(testing::TempDir(), "/save_report.txtpb"); + pm.addInstrumentation(std::make_unique( + report_file_path)); + + const LogicalResult run_result = pm.run(*module_op); + ASSERT_TRUE(succeeded(run_result)); + + // Check that the report file contains `QuantizationResults` textproto, + // reflecting the quantization results, in this case the + // `composite_dot_general_fn` with quantized with `static_range_ptq` method. + const absl::StatusOr file_data = + ReadFileToString(report_file_path); + ASSERT_THAT(file_data, IsOk()); + + /* + results { + quantizable_unit { + name: "composite_dot_general_fn" + } + method { static_range_ptq { } } + } + */ + QuantizationResults results{}; + ASSERT_TRUE(TextFormat::ParseFromString(*file_data, &results)); + ASSERT_THAT(results.results(), SizeIs(1)); + EXPECT_THAT(results.results(0).quantizable_unit().name(), + StrEq("composite_dot_general_fn")); + EXPECT_TRUE(results.results(0).method().has_static_range_ptq()); +} + +TEST_F(SaveQuantizationReportInstrumentationTest, + ReportNotSavedWhenNoQuantizeCompositeFunctionsPass) { + constexpr absl::string_view kModuleWithCompositeDotGeneral = R"mlir( + func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> { + %cst = "stablehlo.constant"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } + + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } + )mlir"; + + const OwningOpRef module_op = + ParseModuleOpString(kModuleWithCompositeDotGeneral); + ASSERT_TRUE(module_op); + + // Create a pass manager with `SaveQuantizationReportInstrumentation` a pass + // that is not `QuantizeCompositeFunctionsPass`. Run the passes against + // `module_op`. + PassManager pm(ctx_.get()); + + pm.addPass(createPrepareQuantizePass()); + + const std::string report_file_path = absl::StrCat( + testing::TempDir(), + "/report_not_saved_no_quantize_composite_functions_pass.txtpb"); + pm.addInstrumentation(std::make_unique( + report_file_path)); + + const LogicalResult run_result = pm.run(*module_op); + ASSERT_TRUE(succeeded(run_result)); + + // The report file is not created because `QuantizeCompositeFunctionsPass` was + // not run. 
+ EXPECT_THAT(ReadFileToString(report_file_path), + StatusIs(absl::StatusCode::kNotFound)); +} + +TEST_F(SaveQuantizationReportInstrumentationTest, + ReportNotSavedWhenReportFilePathIsNullopt) { + constexpr absl::string_view kModuleWithCompositeDotGeneral = R"mlir( + func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> { + %cst = "stablehlo.constant"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } + + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } + )mlir"; + + const OwningOpRef module_op = + ParseModuleOpString(kModuleWithCompositeDotGeneral); + ASSERT_TRUE(module_op); + + PassManager pm(ctx_.get()); + + QuantizeCompositeFunctionsPassOptions options; + pm.addPass(createQuantizeCompositeFunctionsPass(options)); + pm.addInstrumentation(std::make_unique( + /*file_path=*/std::nullopt)); + + // The report file is not created and `SaveQuantizationReportInstrumentation` + // is not run, but the passes still run without errors. 
+ const LogicalResult run_result = pm.run(*module_op); + ASSERT_TRUE(succeeded(run_result)); +} + +} // namespace +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD index 61da2af4d3fb..798d0ecc1396 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD @@ -9,6 +9,31 @@ package( licenses = ["notice"], ) +cc_library( + name = "tf_stablehlo_op_quant_spec", + srcs = [ + "tf_stablehlo_op_quant_spec.cc", + ], + hdrs = ["tf_stablehlo_op_quant_spec.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/common:tf_attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:tf_lift_as_function_call", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/status:statusor", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:protobuf", + "@stablehlo//:stablehlo_ops", + ], +) + cc_library( name = "stablehlo_op_quant_spec", srcs = [ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/tf_stablehlo_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/stablehlo/ops/tf_stablehlo_op_quant_spec.cc new file mode 100644 index 000000000000..d2e413af3e92 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/tf_stablehlo_op_quant_spec.cc @@ -0,0 +1,184 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/stablehlo/ops/tf_stablehlo_op_quant_spec.h" + +#include + +#include "absl/status/statusor.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep + +// To be used with LLVM_DEBUG. +#define DEBUG_TYPE "stablehlo_opt_quant_spec" + +namespace mlir::tf_quant::stablehlo { +namespace { + +using ::mlir::stablehlo::DotGeneralOp; +using ::stablehlo::quantization::Method; +using ::stablehlo::quantization::StaticRangePtq; + +// Whether it represents a lifted function (i.e. `op` is the corresponding +// `XlaCallModuleOp`) that is explicitly marked `NoQuantization`. +bool IsDenylistedLiftedFunction(Operation* op) { + if (auto xla_call_module_op = dyn_cast_or_null(op); + xla_call_module_op != nullptr) { + absl::StatusOr method = GetQuantizationMethod(xla_call_module_op); + if (method.ok() && method->has_no_quantization()) { + return true; + } + } + return false; +} + +// Populates `spec.coeff_op_quant_dim` according to `xla_call_module_op`'s +// `_quantization_method` attribute. If there is an input `QuantizedType` with +// `dimension_specs` set, which represents the quantization dimension for the +// input, then the corresponding operand index -> quantization dimension mapping +// is set for `spec`. +// TODO: b/323478683 - Duplicate tracking of config will be eliminated. +// `OpQuantSpec` will be deprecated and `Method` will be used instead. +void PopulateCoeffOpQuantDimIfPerChannelQuantized( + TF::XlaCallModuleOp xla_call_module_op, OpQuantSpec& spec) { + absl::StatusOr method = GetQuantizationMethod(xla_call_module_op); + if (method.ok() && method->has_static_range_ptq()) { + // TODO: b/331145946 - Use `Method` accessors. + const StaticRangePtq& static_range_ptq_spec = method->static_range_ptq(); + // Look for quantized dimension specs for each quantized type and + // populate `coeff_op_quant_dim`. 
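For illustration, a `_quantization_method` whose `Method` matches the textproto sketch below (field names follow the accessors used in the loop that follows) would result in `coeff_op_quant_dim[1] = 3`, i.e. operand 1 is quantized per-channel along dimension 3:

    static_range_ptq {
      input_quantized_types {
        key: 1
        value { dimension_specs { dimension: 3 } }
      }
    }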
+ for (const auto& [operand_idx, quantized_type] : + static_range_ptq_spec.input_quantized_types()) { + if (quantized_type.has_dimension_specs()) { + spec.coeff_op_quant_dim[operand_idx] = + quantized_type.dimension_specs().dimension(); + } + } + } +} + +} // namespace + +std::unique_ptr GetStableHloOpQuantSpec(Operation* op) { + auto spec = std::make_unique(); + if (auto call_op = dyn_cast_or_null(op)) { + auto entry_function = + call_op->getAttrOfType("_entry_function"); + StringRef function_name = entry_function.getValue(); + if (!function_name.starts_with("composite_")) { + return spec; + } + + if (function_name.contains("conv")) { + // Looks up `Method` to see if it should be per-channel quantized and + // populates the spec accordingly. + PopulateCoeffOpQuantDimIfPerChannelQuantized(call_op, *spec); + + if (function_name.contains("with_bias")) { + spec->biases_params[2] = {{0, 1}, GetUniformQuantizedTypeForBias}; + } + } else if (function_name.contains("dot_general")) { + const auto module_op = call_op->getParentOfType(); + + const SymbolTable symbol_table(module_op); + auto entry_func_op = + dyn_cast_or_null(symbol_table.lookup(function_name)); + auto dot_general_op = *entry_func_op.getOps().begin(); + if (auto optional_dim = GetDotGeneralQuantizationDim(dot_general_op); + optional_dim) { + spec->coeff_op_quant_dim[1] = optional_dim.value(); + } else { + spec->coeff_op_quant_dim[1] = -1; + } + if (function_name.contains("with_bias")) { + spec->biases_params[2] = {{0, 1}, GetUniformQuantizedTypeForBias}; + } + } + for (const auto [operand_idx, per_channel_dim] : spec->coeff_op_quant_dim) { + spec->quantizable_operands.insert(operand_idx); + } + } + return spec; +} + +std::unique_ptr GetStableHloQuantConstraints(Operation* op) { + auto scale_spec = std::make_unique(); + if (llvm::isa(op)) { + scale_spec->has_same_scale_requirement = true; + } + if (llvm::isa(op)) { + scale_spec->has_same_operand_and_result_type_requirement = true; + } + return scale_spec; +} + +bool IsOpQuantizableStableHlo(Operation* op) { + if (isa(op)) { + // Constant ops do not have QuantizableResult attribute but can be + // quantized. + return true; + } else if (op->hasTrait() || + isa(op)) { + // Terminators, qcast and decast are not quantizable. + return false; + } + + // `op` is not quantizable when it is an `XlaCallModuleOp` representing lifted + // function whose `_quantization_method` attribute is marked `NoQuantization`. + // This means this quantizable unit has been explicitly denylisted by the + // user. + if (IsDenylistedLiftedFunction(op)) { + LLVM_DEBUG(llvm::errs() << "Denylisted quantizable unit: \n" << op << "\n"); + return false; + } + + if (GetStableHloQuantConstraints(op)->has_same_scale_requirement) { + return true; + } + + const bool attr_enforced_quantizable = + op->hasAttrOfType(kQuantTraitAttrName) && + op->getAttrOfType(kQuantTraitAttrName).getValue().str() == + QuantTraitValues[QuantizationTrait::FullyQuantizable]; + return attr_enforced_quantizable; +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/tf_stablehlo_op_quant_spec.h b/tensorflow/compiler/mlir/quantization/stablehlo/ops/tf_stablehlo_op_quant_spec.h new file mode 100644 index 000000000000..2c6ca14b5f0a --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/tf_stablehlo_op_quant_spec.h @@ -0,0 +1,41 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_OPS_TF_STABLEHLO_OP_QUANT_SPEC_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_OPS_TF_STABLEHLO_OP_QUANT_SPEC_H_ + +#include + +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir::tf_quant::stablehlo { + +// Returns StableHLO quantization specs for an op. +std::unique_ptr GetStableHloOpQuantSpec(Operation* op); + +// Returns quantization constraints (ex: fixed output, same scale) given +// a StableHLO op. +std::unique_ptr GetStableHloQuantConstraints(Operation* op); + +// Checks if an op is quantizable in StableHLO quantizer. Argument op is not +// necessarily a StableHLO op. +bool IsOpQuantizableStableHlo(Operation* op); + +} // namespace mlir::tf_quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_OPS_TF_STABLEHLO_OP_QUANT_SPEC_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc index 9dfc858ed3fc..babda33245a7 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc @@ -199,7 +199,7 @@ class ConvertTfQuantToMhloIntTest : public Test { AddQuantizationLoweringPasses(pm); CHECK(succeeded(pm.run(module_op.get()))); // Compile the program. - return pjrt_client_->Compile(*module_op, xla::CompileOptions{}); + return pjrt_client_->CompileAndLoad(*module_op, xla::CompileOptions{}); } absl::StatusOr> diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_shape_constraint_to_assert.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_shape_constraint_to_assert.cc new file mode 100644 index 000000000000..d63dfdeaec75 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_shape_constraint_to_assert.cc @@ -0,0 +1,218 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeRange.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "stablehlo/transforms/Passes.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" // IWYU pragma: keep + +namespace mlir::quant::stablehlo { + +#define GEN_PASS_DEF_CONVERTSHAPETOSTABLEHLOWITHCONSTRAINTSPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc" + +namespace { +using ::mlir::stablehlo::AndOp; +using ::mlir::stablehlo::CompareOp; +using ::mlir::stablehlo::ComparisonDirection; +using ::mlir::stablehlo::ConcatenateOp; +using ::mlir::stablehlo::ConstantOp; +using ::mlir::stablehlo::CustomCallOp; +using ::mlir::stablehlo::OrOp; +using ::mlir::stablehlo::ReshapeOp; +using ::mlir::stablehlo::SliceOp; + +// Cast from index-based shape representation used in the Shape dialect to the +// i32-based representation used in HLO: +// * index => tensor. +// * tensor => tensor. +// * All i32-based types from above => themselves. +// There is no convenient op that can express this, so we're using +// unrealized_conversion_cast (with the idea that all these casts will +// annihilate at the end of the pass). +Value castToI32(PatternRewriter& rewriter, Location loc, Value value) { + Type resultType; + if (value.getType().isIndex()) + resultType = RankedTensorType::get({}, rewriter.getI32Type()); + if (auto valueType = mlir::dyn_cast(value.getType())) { + if (!valueType.hasStaticShape()) return {}; + if (valueType.getElementType().isInteger(32)) return value; + if (valueType.getElementType().isIndex()) + resultType = + RankedTensorType::get(valueType.getShape(), rewriter.getI32Type()); + } + if (!resultType) return {}; + auto cast = + rewriter.create(loc, resultType, value); + return cast.getResult(0); +} + +// Pads input tensor by X ones from the left. The number X is +// determined by input pad. Result is tensor<(X+N) x i32>, where the first X +// elements are ones. 
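For example (illustrative values): with `input` of type `tensor<2xi32>` holding `[5, 7]` and `pad = 2`, the result is a `tensor<4xi32>` holding `[1, 1, 5, 7]`.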
+Value padFromLeft(PatternRewriter& rewriter, Location loc, Value input, + int64_t pad) { + Value padI32 = rewriter.create( + loc, DenseIntElementsAttr::get( + RankedTensorType::get({pad}, rewriter.getI32Type()), 1)); + return rewriter.create(loc, ValueRange{padI32, input}, + /*dimension=*/0); +} + +void insertShapeAssertionCustomCall(OpBuilder builder, Location loc, + Value assert) { + auto customCall = + builder.create(loc, TypeRange{}, ValueRange{assert}); + customCall.setCallTargetName("shape_assertion"); + customCall.setHasSideEffect(true); + customCall->setAttr("error_message", + builder.getStringAttr("Shape assertion failed")); +} + +struct ConvertCstrBroadcastableOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(shape::CstrBroadcastableOp op, + PatternRewriter& rewriter) const override { + // As defined, op inputs must be 1D tensor or !shape.shape. + // We only support inputs of two 1D tensors. + if (op.getShapes().size() != 2) return failure(); + auto shape1 = castToI32(rewriter, op.getLoc(), op.getShapes().front()); + auto shape2 = castToI32(rewriter, op.getLoc(), op.getShapes().back()); + if (!shape1 || !shape2) return failure(); + auto tensorType1 = mlir::dyn_cast(shape1.getType()); + auto tensorType2 = mlir::dyn_cast(shape2.getType()); + if (!tensorType1 || !tensorType2) return failure(); + + // If the two operand shapes are of different sizes, the smaller one is + // padded with 1's from the left. + int32_t rank = + std::max(tensorType1.getDimSize(0), tensorType2.getDimSize(0)); + if (tensorType1.getDimSize(0) < tensorType2.getDimSize(0)) { + shape1 = + padFromLeft(rewriter, op.getLoc(), shape1, + tensorType2.getDimSize(0) - tensorType1.getDimSize(0)); + } else if (tensorType1.getDimSize(0) > tensorType2.getDimSize(0)) { + shape2 = + padFromLeft(rewriter, op.getLoc(), shape2, + tensorType1.getDimSize(0) - tensorType2.getDimSize(0)); + } + + // Compute if each dim is broadcastable. A dim is broadcastable iff + // dimSize1 == dimSize2 or dimSize1 == 1 or dimSize2 == 1 + auto allOne = rewriter.create( + op.getLoc(), DenseIntElementsAttr::get( + RankedTensorType::get({rank}, rewriter.getI32Type()), + static_cast(1))); + Value dimSize1Is1 = rewriter.create(op.getLoc(), shape1, allOne, + ComparisonDirection::EQ); + Value dimSize2Is1 = rewriter.create(op.getLoc(), shape2, allOne, + ComparisonDirection::EQ); + Value eitherDimSizeIs1 = + rewriter.create(op.getLoc(), dimSize1Is1, dimSize2Is1); + Value dimSizeEq = rewriter.create(op.getLoc(), shape1, shape2, + ComparisonDirection::EQ); + Value dimBroadcastable = + rewriter.create(op.getLoc(), eitherDimSizeIs1, dimSizeEq); + + // Iterate over each dim to check that all dims are broadcastable. + auto boolType = RankedTensorType::get({1}, rewriter.getI1Type()); + Value allBroadcastable = rewriter.create( + op.getLoc(), DenseIntElementsAttr::get(boolType, true)); + for (auto i = 0; i < rank; ++i) { + Value broadcastable = rewriter.create( + op.getLoc(), dimBroadcastable, rewriter.getDenseI64ArrayAttr(i), + rewriter.getDenseI64ArrayAttr(i + 1), + rewriter.getDenseI64ArrayAttr(1)); + allBroadcastable = + rewriter.create(op.getLoc(), allBroadcastable, broadcastable); + } + Value allBroadcastableScalar = rewriter.create( + op.getLoc(), RankedTensorType::get({}, rewriter.getI1Type()), + allBroadcastable); + + // Add CustomCallOp and replace Cstr op with const witness, which is useful + // for canonicalizer to remove the shape.assuming region. 
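Roughly, the IR emitted in place of the `shape.cstr_broadcastable` op looks like the sketch below (SSA names and the exact attribute printing are illustrative, not taken from a real dump):

    stablehlo.custom_call @shape_assertion(%all_broadcastable_scalar)
        {error_message = "Shape assertion failed", has_side_effect = true}
        : (tensor<i1>) -> ()
    %witness = shape.const_witness true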
+ insertShapeAssertionCustomCall(rewriter, op->getLoc(), + allBroadcastableScalar); + rewriter.replaceOpWithNewOp(op.getOperation(), true); + return success(); + } +}; + +bool hasIndexStyle(Value value) { + if (value.getType().isIndex()) return true; + auto type = mlir::dyn_cast(value.getType()); + return type && type.getElementType().isIndex(); +} + +struct ConvertShapeToStablehloWithConstraintsPass + : public impl::ConvertShapeToStablehloWithConstraintsPassBase< + ConvertShapeToStablehloWithConstraintsPass> { + void runOnOperation() override { + ConversionTarget target(getContext()); + target.addIllegalDialect(); + target.addIllegalDialect(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addDynamicallyLegalDialect<::mlir::stablehlo::StablehloDialect>( + [](Operation* op) { + return !llvm::any_of(op->getOperands(), hasIndexStyle); + }); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + + RewritePatternSet patterns(&getContext()); + ::mlir::stablehlo::populateShapeToStablehloPatterns(&getContext(), + &patterns); + + patterns.add(&getContext()); + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns)))) + return signalPassFailure(); + } +}; + +} // namespace +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc index 0f4d2074e420..1a6663f4a735 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include "absl/base/nullability.h" +#include "llvm/Support/LogicalResult.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project @@ -47,7 +48,7 @@ using ::mlir::stablehlo::TransposeOp; // Returns `success()` if `op` is a `TransposeOp` with permutation attribute // equivalent to `permuation`. -LogicalResult IsTransposeOpWithPermuation(absl::Nullable op, +LogicalResult IsTransposeOpWithPermuation(Operation* absl_nullable op, const ArrayRef permutation) { auto transpose_op = dyn_cast_or_null(op); return success(transpose_op != nullptr && transpose_op.getPermutation() == @@ -89,8 +90,8 @@ void DeferRhsTransposeForBinaryOp(OpT op, PatternRewriter& rewriter) { // "Climbs up" the `op` if `op` is a `BraodcastInDimOp` and returns the defining // op of its operand. Returns `op` otherwise. May return `nullptr` when the // `BroadcastInDimOp`'s operand is a block argument. -absl::Nullable SkipUpwardsOptionalBroadcastInDimOp( - absl::Nonnull op) { +Operation* absl_nullable SkipUpwardsOptionalBroadcastInDimOp( + Operation* absl_nonnull op) { if (auto broadcast_in_dim_op = dyn_cast_or_null(op); broadcast_in_dim_op != nullptr) { return broadcast_in_dim_op.getOperand().getDefiningOp(); @@ -100,9 +101,10 @@ absl::Nullable SkipUpwardsOptionalBroadcastInDimOp( class DeferActivationTransposeForAddOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(AddOp op) const override { + LogicalResult matchAndRewrite(AddOp op, + PatternRewriter& rewriter) const override { // Only supports the case for 2D convolution. 
const Value lhs = op.getOperand(0); if (!HasRankOf(lhs, /*rank=*/4)) return failure(); @@ -119,12 +121,13 @@ class DeferActivationTransposeForAddOp : public OpRewritePattern { } // Match LHS permutation that converts: NHWC -> NCHW. - return IsTransposeOpWithPermuation(lhs.getDefiningOp(), - kNhwcToNchwPermutation); - } + if (IsTransposeOpWithPermuation(lhs.getDefiningOp(), kNhwcToNchwPermutation) + .failed()) { + return failure(); + } - void rewrite(AddOp op, PatternRewriter& rewriter) const override { DeferRhsTransposeForBinaryOp(op, rewriter); + return success(); } }; @@ -135,9 +138,10 @@ class DeferActivationTransposeForAddOp : public OpRewritePattern { class DeferActivationTransposeForMaxPoolReduceWindowOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(mlir::stablehlo::ReduceWindowOp op) const override { + LogicalResult matchAndRewrite(mlir::stablehlo::ReduceWindowOp op, + PatternRewriter& rewriter) const override { if (failed(MatchMaxPoolReduceWindowOp(op))) return failure(); // Match only when the lhs is connected to a transpose. @@ -146,13 +150,12 @@ class DeferActivationTransposeForMaxPoolReduceWindowOp if (!HasRankOf(lhs, /*rank=*/4)) return failure(); // Match input permutation that converts: NHWC -> NCHW. - return IsTransposeOpWithPermuation(lhs.getDefiningOp(), - kNhwcToNchwPermutation); - } + if (IsTransposeOpWithPermuation(lhs.getDefiningOp(), kNhwcToNchwPermutation) + .failed()) { + return failure(); + } - // Pushes the transpose op at the input to the result. - void rewrite(mlir::stablehlo::ReduceWindowOp op, - PatternRewriter& rewriter) const override { + // Pushes the transpose op at the input to the result. auto transpose_op = cast(op.getOperand(0).getDefiningOp()); const auto result_type = mlir::cast(op.getResult(0).getType()); @@ -192,6 +195,7 @@ class DeferActivationTransposeForMaxPoolReduceWindowOp rewriter); rewriter.replaceAllUsesWith(op.getResult(0), result_transpose_op); + return success(); } private: @@ -242,9 +246,10 @@ class DeferActivationTransposeForMaxPoolReduceWindowOp // `transpose(maximum(%rhs, transpose(%lhs)))`. 
class DeferActivationTransposeForMaxOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(MaxOp op) const override { + LogicalResult matchAndRewrite(MaxOp op, + PatternRewriter& rewriter) const override { Value input = op.getOperand(0); if (!HasRankOf(input, /*rank=*/4)) return failure(); @@ -255,12 +260,13 @@ class DeferActivationTransposeForMaxOp : public OpRewritePattern { return failure(); } - return IsTransposeOpWithPermuation(input.getDefiningOp(), - kNhwcToNchwPermutation); - } - - void rewrite(MaxOp op, PatternRewriter& rewriter) const override { + if (IsTransposeOpWithPermuation(input.getDefiningOp(), + kNhwcToNchwPermutation) + .failed()) { + return failure(); + } DeferRhsTransposeForBinaryOp(op, rewriter); + return success(); } }; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc index 24f5ab6a10fb..197fb1c868af 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc @@ -118,9 +118,10 @@ class DenseElementsTransposer { class FoldTransposedConstantOp : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(mlir::stablehlo::TransposeOp op) const override { + LogicalResult matchAndRewrite(mlir::stablehlo::TransposeOp op, + PatternRewriter& rewriter) const override { Value operand = op.getOperand(); auto const_op = dyn_cast_or_null(operand.getDefiningOp()); @@ -132,14 +133,9 @@ class FoldTransposedConstantOp return failure(); } - return success( - mlir::isa_and_nonnull(const_op.getValue())); - } - - void rewrite(mlir::stablehlo::TransposeOp op, - PatternRewriter& rewriter) const override { - auto const_op = - cast(op.getOperand().getDefiningOp()); + if (!mlir::isa_and_nonnull(const_op.getValue())) { + return failure(); + } const auto value_attr = mlir::cast(const_op.getValue()); @@ -168,7 +164,8 @@ class FoldTransposedConstantOp combined_loc, new_value_attr); rewriter.replaceAllUsesWith(op, new_const_op); - }; + return success(); + } }; } // namespace diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_weight_param.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_weight_param.cc index a9e13695fbda..fb2e5caba7b5 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_weight_param.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_weight_param.cc @@ -82,12 +82,11 @@ class InsertWeightParamPass class InsertWeightParamPattern : public OpTraitRewritePattern { public: - using OpTraitRewritePattern::OpTraitRewritePattern; - explicit InsertWeightParamPattern(MLIRContext* context) - : OpTraitRewritePattern(context) {} + : OpTraitRewritePattern(context) {} - LogicalResult match(Operation* op) const override { + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { if (op->getNumResults() != 1) { return failure(); } @@ -95,27 +94,11 @@ class InsertWeightParamPattern if (!type || !type.getElementType().isF32()) { return failure(); } - return success( - op->hasOneUse() && - IsWeightQuantizableFunction(*op->getUses().begin(), type.getRank())); - } - - // Checks if the operand is second operand of `tf.XlaCallModule` op for - // `stablehlo.convolution` 
or `stablehlo.dot_general` with fully_quantizable - // trait. - static bool IsWeightQuantizableFunction(OpOperand& operand, int64_t rank) { - if (operand.getOperandNumber() != 1) { - return false; - } - Operation* user = operand.getOwner(); - if (!IsWeightOnlyQuantizableOp(*user)) { - return false; + if (!op->hasOneUse() || + !IsWeightQuantizableFunction(*op->getUses().begin(), type.getRank())) { + return failure(); } - Method method = GetQuantizationMethodOrDefault(user); - return HasValidWeightOnlyPtqMethod(method.weight_only_ptq(), rank); - } - void rewrite(Operation* op, PatternRewriter& rewriter) const override { Operation* quantizable_op = *op->getUsers().begin(); DenseFPElementsAttr attr; matchPattern(op->getResult(0), m_Constant(&attr)); @@ -143,7 +126,7 @@ class InsertWeightParamPattern op->emitError( "Failed to get weight quantization parameters for weight-only " "quantization."); - return; + return failure(); } const Type expressed_type = op->getResult(0).getType(); @@ -156,6 +139,22 @@ class InsertWeightParamPattern auto dq = rewriter.create(op->getLoc(), expressed_type, q); quantizable_op->setOperand(1, dq.getResult()); + return success(); + } + + // Checks if the operand is second operand of `tf.XlaCallModule` op for + // `stablehlo.convolution` or `stablehlo.dot_general` with fully_quantizable + // trait. + static bool IsWeightQuantizableFunction(OpOperand& operand, int64_t rank) { + if (operand.getOperandNumber() != 1) { + return false; + } + Operation* user = operand.getOwner(); + if (!IsWeightOnlyQuantizableOp(*user)) { + return false; + } + Method method = GetQuantizationMethodOrDefault(user); + return HasValidWeightOnlyPtqMethod(method.weight_only_ptq(), rank); } private: @@ -220,7 +219,7 @@ class InsertWeightParamPattern dimension_numbers.getRhsContractingDimensions(); ArrayRef rhs_batching_dims = dimension_numbers.getRhsBatchingDimensions(); - int64_t rank = dot.getRhs().getType().cast().getRank(); + int64_t rank = mlir::cast(dot.getRhs().getType()).getRank(); for (int i = 0; i < rank; ++i) { // Return the first non-contracting, non-batching dimension of rhs. if (llvm::find(rhs_contracting_dims, i) == rhs_contracting_dims.end() && @@ -229,7 +228,7 @@ class InsertWeightParamPattern } } } - return op.getOperand(1).getType().cast().getRank() - 1; + return mlir::cast(op.getOperand(1).getType()).getRank() - 1; } }; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc index 293b4a19c6eb..23ce9c168843 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc @@ -103,9 +103,10 @@ class MergeFusionWithUniformDequantizePattern } for (auto user : users_to_erase) rewriter.eraseOp(user); rewriter.eraseOp(call_op); - func_op.eraseResult(0); - func_op.insertResult(0, new_call_op.getResult(0).getType(), - /*resultAttrs=*/nullptr); + if (failed(func_op.eraseResult(0))) return failure(); + if (failed(func_op.insertResult(0, new_call_op.getResult(0).getType(), + /*resultAttrs=*/nullptr))) + return failure(); // Modify the quantized fused function to do dequantize+relu(6). 
rewriter.setInsertionPoint(req_op); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc index 39546b337782..4bb871a56886 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc @@ -50,9 +50,10 @@ class NchwConvolutionToNhwcPass class RewriteNchwConvolutionToNhwc : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(mlir::stablehlo::ConvolutionOp op) const override { + LogicalResult matchAndRewrite(mlir::stablehlo::ConvolutionOp op, + PatternRewriter& rewriter) const override { // Handles 2D convolutions only. if (!HasRankOf(op.getOperand(0), /*rank=*/4) || !HasRankOf(op.getOperand(1), /*rank=*/4)) { @@ -62,13 +63,14 @@ class RewriteNchwConvolutionToNhwc if (!IsOpNotQuantized(op)) return failure(); const ConvDimensionNumbersAttr dimension_nums = op.getDimensionNumbers(); - return success(MatchInputDimensionNumbers(dimension_nums) && - MatchKernelDimensionNumbers(dimension_nums) && - MatchOutputDimensionNumbers(dimension_nums)); - } + const bool dimension_nums_matched = + MatchInputDimensionNumbers(dimension_nums) && + MatchKernelDimensionNumbers(dimension_nums) && + MatchOutputDimensionNumbers(dimension_nums); + if (!dimension_nums_matched) { + return failure(); + } - void rewrite(mlir::stablehlo::ConvolutionOp op, - PatternRewriter& rewriter) const override { // Transpose the input tensor: [b, f, 0, 1] => [b, 0, 1, f] Value input = op->getOperand(0); const TensorType new_input_tensor_type = GetTransposedTensorType( @@ -129,6 +131,7 @@ class RewriteNchwConvolutionToNhwc rewriter.getDenseI64ArrayAttr(kNhwcToNchwPermutation)); rewriter.replaceAllUsesWith(op, output_transpose_op); + return success(); } private: diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td index da59c218a569..e6108ca6d13e 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td @@ -153,6 +153,15 @@ def ConvertXlaCallModuleOpToBfloat16Pass : Pass<"stablehlo-convert-xla-call-modu ]; } +def ConvertShapeToStablehloWithConstraintsPass : Pass<"stablehlo-convert-shape-to-stablehlo-with-constraints", "mlir::func::FuncOp"> { + let summary = "Convert shape.cstr_broadcastable to stablehlo.custom_call @shape_assertion"; + let dependentDialects = [ + "mlir::shape::ShapeDialect", + "mlir::tensor::TensorDialect", + "mlir::stablehlo::StablehloDialect", + ]; +} + def OptimizeGraphPass : Pass<"optimize-graph", "ModuleOp"> { let summary = "Optimize the sub-optimal patterns after quantization."; let dependentDialects = ["mlir::stablehlo::StablehloDialect",]; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc index 350b6f786452..d6a88055c8c8 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc @@ -672,11 +672,12 @@ class XlaCallModuleOpToCallOp : public OpRewritePattern { public: explicit XlaCallModuleOpToCallOp( MLIRContext& ctx, const bool 
enable_per_channel_quantized_weight) - : OpRewritePattern(&ctx), + : OpRewritePattern::OpRewritePattern(&ctx), enable_per_channel_quantized_weight_( enable_per_channel_quantized_weight) {} - LogicalResult match(TF::XlaCallModuleOp op) const override { + LogicalResult matchAndRewrite(TF::XlaCallModuleOp op, + PatternRewriter& rewriter) const override { ModuleOp module_op = op->getParentOfType(); // Ignore ops without quantization method. @@ -697,22 +698,20 @@ class XlaCallModuleOpToCallOp : public OpRewritePattern { return failure(); } Method quantization_method = GetQuantizationMethodOrDefault(op); - return FuncBodyRewritePatternT(enable_per_channel_quantized_weight_) - .match(entry_func_op, quantization_method); - } + if (FuncBodyRewritePatternT(enable_per_channel_quantized_weight_) + .match(entry_func_op, quantization_method) + .failed()) { + return failure(); + } - void rewrite(TF::XlaCallModuleOp xla_call_module_op, - PatternRewriter& rewriter) const override { // TODO: b/331145946 - Each quantization method should be valid // (GetQuantizationMethodOrDefault swallows invalid method attribute). Check // the validity in `match()`. Use accessors to achieve this. - const Method quantization_method = - GetQuantizationMethodOrDefault(xla_call_module_op); - ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp( - *rewriter.getContext(), rewriter, xla_call_module_op, + *rewriter.getContext(), rewriter, op, FuncBodyRewritePatternT(enable_per_channel_quantized_weight_), quantization_method); + return success(); } private: @@ -730,7 +729,17 @@ class QuantizeOpWithRegionPattern explicit QuantizeOpWithRegionPattern(MLIRContext& ctx) : OpRewritePattern(&ctx) {}; - LogicalResult match(quantfork::DequantizeCastOp op) const final { + LogicalResult matchAndRewrite(quantfork::DequantizeCastOp op, + PatternRewriter& rewriter) const final { + if (match(op).failed()) { + return failure(); + } + rewrite(op, rewriter); + return success(); + } + + private: + LogicalResult match(quantfork::DequantizeCastOp op) const { // Match only when there is one user of the dequantize op. if (!op.getResult().hasOneUse()) { return failure(); @@ -759,7 +768,7 @@ class QuantizeOpWithRegionPattern } void rewrite(quantfork::DequantizeCastOp op, - PatternRewriter& rewriter) const final { + PatternRewriter& rewriter) const { // Rewrite the floating-point ops to the quantized version, by fusing // preceding dequantize ops and succeding quantize ops. for (Operation* op_with_region : op.getResult().getUsers()) { @@ -846,7 +855,6 @@ class QuantizeOpWithRegionPattern } } - private: // Checks if an op is quantizable in a nested region. bool IsOpQuantizableInNestedRegion(Operation& op) const { return isa(op); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/remove_sharding_custom_call.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/remove_sharding_custom_call.td index 70ee6dc077ee..0ff3ece326d2 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/remove_sharding_custom_call.td +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/remove_sharding_custom_call.td @@ -15,7 +15,7 @@ limitations under the License. 
include "stablehlo/dialect/StablehloOps.td" class IsStringAttrOf : Constraint< - CPred<"::llvm::isa_and_nonnull($_self) && $_self.cast().getValue() == \"" # value # "\"">, + CPred<"::llvm::isa_and_nonnull($_self) && llvm::cast($_self).getValue() == \"" # value # "\"">, "Is a string attribute whose value is \"" # value # "\"" >; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_passes.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_passes.h new file mode 100644 index 000000000000..1e16ee648aef --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_passes.h @@ -0,0 +1,40 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_TESTING_TF_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_TESTING_TF_PASSES_H_ + +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep + +namespace mlir::tf_quant::stablehlo::testing { + +// Identifies predefined `QuantizationSpecs` for +// `TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass`. The pass +// option argument is specified in line comments for each enum value. +enum class TestQuantizationSpecs { + kEmpty, // empty + kDisableAllDotGeneral, // disable-all-dot-general + kStaticRangePtqToAll, // static-range-ptq-to-all + kStaticRangePtqToComputeHeavy, // static-range-ptq-to-compute-heavy +}; + +// Adds generated pass default constructors or options definitions. +#define GEN_PASS_DECL +// Adds generated pass registration functions. +#define GEN_PASS_REGISTRATION +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_passes.h.inc" + +} // namespace mlir::tf_quant::stablehlo::testing + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_TESTING_TF_PASSES_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_passes.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_passes.td new file mode 100644 index 000000000000..63db23ce3c1f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_passes.td @@ -0,0 +1,94 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Passes only used for testing purposes. 
+ +include "mlir/Pass/PassBase.td" + +def TestPreCalibrationComponentPass : Pass<"tf-stablehlo-test-pre-calibration-component", "mlir::ModuleOp"> { + let summary = "Test-only pass to test the PreCalibrationComponent."; + let description = [{ + Runs the pre calibration passes for post-training quantization with default + configuration. + }]; + let dependentDialects = [ + "mlir::stablehlo::StablehloDialect", "mlir::TF::TensorFlowDialect", + "mlir::func::FuncDialect", "mlir::tf_executor::TensorFlowExecutorDialect", + "mlir::mhlo::MhloDialect", "mlir::vhlo::VhloDialect", + ]; +} + +def TestPostCalibrationComponentPass : Pass<"tf-stablehlo-test-post-calibration-component", "mlir::ModuleOp"> { + let summary = "Test-only pass to test the PostCalibrationComponent."; + let description = [{ + Runs the post-calibration passes for post-training quantization. + }]; + let options = [ + Option<"unpack_quantized_types_", "unpack-quantized-types", "bool", + /*default=*/"true", "Unpacks ops with uniform quantized types into " + "operations without uniform quantized types (mostly i8 or i32)."> + ]; + let dependentDialects = [ + "mlir::stablehlo::StablehloDialect", "mlir::TF::TensorFlowDialect", + "mlir::func::FuncDialect", "mlir::mhlo::MhloDialect", + "mlir::quant::QuantDialect", "mlir::chlo::ChloDialect", + "mlir::vhlo::VhloDialect", "mlir::shape::ShapeDialect", + "mlir::quant::ir::TFQuantDialect", + ]; +} + +def TestTFToStablehloPass : Pass<"tf-stablehlo-test-tf-to-stablehlo", "mlir::ModuleOp"> { + let summary = "Test-only pass to test TFToStablehloPasses."; + let description = [{ + Runs the TFToStablehloPasses. + }]; + let dependentDialects = [ + "mlir::stablehlo::StablehloDialect", "mlir::TF::TensorFlowDialect", + "mlir::chlo::ChloDialect", "mlir::quant::QuantDialect", + "mlir::mhlo::MhloDialect", "mlir::shape::ShapeDialect", + "mlir::sparse_tensor::SparseTensorDialect", "mlir::ub::UBDialect", + "mlir::vhlo::VhloDialect", + ]; +} + +def TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass : + Pass<"tf-stablehlo-test-lift-quantizable-spots-as-functions-with-quantization-specs", "mlir::ModuleOp"> { + let summary = "Test-only pass for testing the LiftQuantizableSpotsAsFunctionsPass with a predefined QuantizationSpecs."; + let description = [{ + This test-only pass is the same as `LiftQuantizableSpotsAsFunctionsPass` but + has predefined `QuantizationSpecs` to make FileCheck testing easier. 
+ }]; + let options = [ + Option<"quantization_specs_", "quantization-specs", + "mlir::tf_quant::stablehlo::testing::TestQuantizationSpecs", + /*default=*/"mlir::tf_quant::stablehlo::testing::TestQuantizationSpecs::kEmpty", + "Sets one of the predefined `QuantizationSpecs` for testing.", + [{llvm::cl::values( + clEnumValN(mlir::tf_quant::stablehlo::testing::TestQuantizationSpecs::kEmpty, + "empty", "Uses empty (default) QuantizationSpecs."), + clEnumValN(mlir::tf_quant::stablehlo::testing::TestQuantizationSpecs::kDisableAllDotGeneral, + "disable-all-dot-general", "Disables all dot_general ops by matching lifted function names"), + clEnumValN(mlir::tf_quant::stablehlo::testing::TestQuantizationSpecs::kStaticRangePtqToAll, + "static-range-ptq-to-all", "Applies `StaticRangePtq` to all quantizable units."), + clEnumValN(mlir::tf_quant::stablehlo::testing::TestQuantizationSpecs::kStaticRangePtqToComputeHeavy, + "static-range-ptq-to-compute-heavy", "Applies `StaticRangePtq` to only compute heavy units.") + )}]> + ]; + let dependentDialects = [ + "mlir::func::FuncDialect", + "mlir::stablehlo::StablehloDialect", + "TF::TensorFlowDialect", + ]; +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_test_lift_quantizable_spots_as_functions_with_quantization_specs.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_test_lift_quantizable_spots_as_functions_with_quantization_specs.cc new file mode 100644 index 000000000000..4996f96dbff6 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_test_lift_quantizable_spots_as_functions_with_quantization_specs.cc @@ -0,0 +1,139 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" // IWYU pragma: keep +#include "tsl/platform/protobuf.h" // IWYU pragma: keep + +namespace mlir::tf_quant::stablehlo::testing { + +// NOLINTNEXTLINE - Automatically generated. 
+#define GEN_PASS_DEF_TESTLIFTQUANTIZABLESPOTSASFUNCTIONSWITHQUANTIZATIONSPECSPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_passes.h.inc" + +namespace { + +using ::stablehlo::quantization::QuantizationSpecs; +using ::tsl::protobuf::TextFormat; +// NOLINTNEXTLINE(misc-include-cleaner) - Required for OSS. +using ::tsl::protobuf::io::ArrayInputStream; + +// Empty (default) `QuantizationSpecs` proto. +constexpr absl::string_view kSpecsEmpty = R"pb(specs + [])pb"; + +// Configure `QuantizationSpecs` to disable quantization for all dot_general +// quantizable units. +constexpr absl::string_view kSpecsDisableAllDotGeneral = + R"pb(specs + [ { + matcher { function_name { regex: "composite_dot_general_.*" } } + method { no_quantization {} } + }])pb"; + +// Configure `QuantizationSpecs` to apply `StaticRangePtq` to all quantizable +// units. +constexpr absl::string_view kSpecsStaticRangePtqToAll = + R"pb(specs + [ { + matcher { function_name { regex: ".*" } } + method { static_range_ptq {} } + }])pb"; + +// Configure `QuantizationSpecs` to apply `StaticRangePtq` to compute heavy +// units. +constexpr absl::string_view kSpecsStaticRangePtqToComputeHeavy = + R"pb(specs + [ { + matcher { function_name { regex: "^.*(conv|dot|gather).*" } } + method { static_range_ptq {} } + }])pb"; + +class TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass + : public impl:: + TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPassBase< + TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass> { + public: + using impl::TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPassBase< + TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass>:: + TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPassBase; + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass) + + private: + void runOnOperation() override; +}; + +// `TestQuantizationSpecs` -> predefined `QuantizationSpecs` textproto. +absl::string_view GetQuantizationSpecsTextProto( + const TestQuantizationSpecs test_specs) { + switch (test_specs) { + case TestQuantizationSpecs::kEmpty: + return kSpecsEmpty; + case TestQuantizationSpecs::kDisableAllDotGeneral: + return kSpecsDisableAllDotGeneral; + case TestQuantizationSpecs::kStaticRangePtqToAll: + return kSpecsStaticRangePtqToAll; + case TestQuantizationSpecs::kStaticRangePtqToComputeHeavy: + return kSpecsStaticRangePtqToComputeHeavy; + } +} + +// Parses a text proto into a `QuantizationSpecs` proto. Returns +// `InvalidArgumentError` if `text_proto` is invalid. +absl::StatusOr ParseTextProto( + const absl::string_view text_proto) { + QuantizationSpecs quantization_specs; + TextFormat::Parser parser; + ArrayInputStream input_stream(text_proto.data(), text_proto.size()); + if (parser.Parse(&input_stream, &quantization_specs)) { + return quantization_specs; + } + return absl::InvalidArgumentError("Could not parse text proto."); +} + +void TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass:: + runOnOperation() { + PassManager pass_manager{&getContext()}; + + // Construct `QuantizationSpecs` from the pass option `quantization-specs`. 
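+  // Each option value maps to one of the textproto constants above. For
+  // example, "disable-all-dot-general" selects kSpecsDisableAllDotGeneral,
+  // whose matcher regex "composite_dot_general_.*" disables quantization for
+  // every lifted dot_general function.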
+ const absl::StatusOr quantization_specs = + ParseTextProto(GetQuantizationSpecsTextProto(quantization_specs_)); + if (!quantization_specs.ok()) { + signalPassFailure(); + return; + } + + pass_manager.addPass( + CreateLiftQuantizableSpotsAsFunctionsPass(*quantization_specs)); + + if (failed(pass_manager.run(getOperation()))) { + signalPassFailure(); + } +} + +} // namespace +} // namespace mlir::tf_quant::stablehlo::testing diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_test_post_calibration_component.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_test_post_calibration_component.cc new file mode 100644 index 000000000000..d496d9f5b457 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_test_post_calibration_component.cc @@ -0,0 +1,83 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "stablehlo/dialect/ChloOps.h" // from @stablehlo // IWYU pragma: keep +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "stablehlo/dialect/VhloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_post_calibration.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_passes.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" // IWYU pragma: keep +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" // IWYU pragma: keep + +namespace mlir::tf_quant::stablehlo::testing { + +#define GEN_PASS_DEF_TESTPOSTCALIBRATIONCOMPONENTPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_passes.h.inc" + +namespace { + +using ::stablehlo::quantization::ExpandPresets; +using ::stablehlo::quantization::PipelineConfig; +using ::stablehlo::quantization::QuantizationConfig; + +class TestPostCalibrationComponentPass + : public impl::TestPostCalibrationComponentPassBase< + TestPostCalibrationComponentPass> { + public: + using impl::TestPostCalibrationComponentPassBase< + 
TestPostCalibrationComponentPass>::TestPostCalibrationComponentPassBase; + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestPostCalibrationComponentPass) + + private: + void runOnOperation() override; +}; + +void TestPostCalibrationComponentPass::runOnOperation() { + ModuleOp module_op = getOperation(); + MLIRContext& ctx = getContext(); + + OpPassManager pm(ModuleOp::getOperationName()); + + QuantizationConfig config = QuantizationConfig::default_instance(); + config.mutable_static_range_ptq_preset(); + + const QuantizationConfig new_config = ExpandPresets(config); + + PipelineConfig pipeline_config; + pipeline_config.set_unpack_quantized_types(unpack_quantized_types_); + + PostCalibrationComponent component(&ctx); + component.AddPasses(pm, new_config.specs(), pipeline_config); + + if (failed(runPipeline(pm, module_op))) { + signalPassFailure(); + } +} + +} // namespace + +} // namespace mlir::tf_quant::stablehlo::testing diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_test_pre_calibration_component.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_test_pre_calibration_component.cc new file mode 100644 index 000000000000..5403e3759a4a --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_test_pre_calibration_component.cc @@ -0,0 +1,67 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "absl/status/statusor.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "stablehlo/dialect/VhloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pre_calibration.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" // IWYU pragma: keep +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" // IWYU pragma: keep + +namespace mlir::tf_quant::stablehlo::testing { + +#define GEN_PASS_DEF_TESTPRECALIBRATIONCOMPONENTPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_passes.h.inc" + +namespace { + +using ::stablehlo::quantization::ExpandPresets; +using ::stablehlo::quantization::PopulateDefaults; +using ::stablehlo::quantization::QuantizationConfig; + +class TestPreCalibrationComponentPass + : public impl::TestPreCalibrationComponentPassBase< + TestPreCalibrationComponentPass> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestPreCalibrationComponentPass) + + private: + void runOnOperation() override; +}; + +void TestPreCalibrationComponentPass::runOnOperation() { + ModuleOp module_op = getOperation(); + MLIRContext& ctx = getContext(); + + // Simply runs the PreCalibrationComponent with a default configuration. + quant::stablehlo::PreCalibrationComponent component(&ctx); + QuantizationConfig quantization_config{}; + quantization_config.mutable_static_range_ptq_preset(); + quantization_config = ExpandPresets(PopulateDefaults(quantization_config)); + if (!component.Run(module_op, quantization_config).ok()) { + signalPassFailure(); + } +} + +} // namespace +} // namespace mlir::tf_quant::stablehlo::testing diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_test_tf_to_stablehlo_pass.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_test_tf_to_stablehlo_pass.cc new file mode 100644 index 000000000000..354c7f739a33 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_test_tf_to_stablehlo_pass.cc @@ -0,0 +1,70 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/UB/IR/UBOps.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "stablehlo/dialect/ChloOps.h" // from @stablehlo // IWYU pragma: keep +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "stablehlo/dialect/VhloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_preprocess.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" // IWYU pragma: keep +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" // IWYU pragma: keep + +namespace mlir::tf_quant::stablehlo::testing { + +#define GEN_PASS_DEF_TESTTFTOSTABLEHLOPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/tf_passes.h.inc" + +namespace { + +using ::tensorflow::quantization::AddTFToStablehloPasses; +using ::tensorflow::quantization::RunPassesOnModuleOp; + +class TestTFToStablehloPass + : public impl::TestTFToStablehloPassBase { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestTFToStablehloPass) + + private: + void runOnOperation() override; +}; + +void TestTFToStablehloPass::runOnOperation() { + ModuleOp module_op = getOperation(); + MLIRContext* ctx = &getContext(); + mlir::PassManager pm(ctx); + + AddTFToStablehloPasses(pm); + if (!RunPassesOnModuleOp( + /*mlir_dump_file_name=*/"test_tf_to_stablehlo_pass", pm, module_op) + .ok()) { + return signalPassFailure(); + } +} + +} // namespace +} // namespace mlir::tf_quant::stablehlo::testing diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_convert_func_to_bfloat16.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_convert_func_to_bfloat16.cc new file mode 100644 index 000000000000..d4f2d88ea34f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_convert_func_to_bfloat16.cc @@ -0,0 +1,232 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeRange.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/utils/bfloat16_type.h" +#include "tensorflow/core/platform/bfloat16.h" + +namespace mlir::tf_quant::stablehlo { +namespace { + +class BFloat16TypeConverter : public TypeConverter { + public: + BFloat16TypeConverter() { + addConversion([](const Type type) -> Type { + return quant::stablehlo::IsLargeFloatType(type) + ? quant::stablehlo::ToBfloat16Type(type) + : type; + }); + } +}; + +// This helper function makes legality check easier. Both convert ops in the +// patterns below are considered legal: +// - `BitcastConvertOp` (i32 -> f32) + `ConvertOp` (f32 -> bf16) +// - `ConvertOp` (bf16 -> f32) -> `BitcastConvertOp` (f32 -> i32) +template +bool IsConvertOpLegal(ConvertOp convert_op, BFloat16TypeConverter& converter) { + if (!converter.isLegal(convert_op.getOperand().getType())) { + auto other_convert_op = dyn_cast_or_null( + convert_op.getOperand().getDefiningOp()); + return other_convert_op && + converter.isLegal(other_convert_op.getOperand().getType()); + } else if (!converter.isLegal(convert_op.getResult().getType())) { + if (!convert_op.getResult().hasOneUse()) { + return false; + } + auto other_convert_op = dyn_cast_or_null( + *convert_op.getResult().getUsers().begin()); + return other_convert_op && + converter.isLegal(other_convert_op.getResult().getType()); + } + return true; +} + +class BFloat16TypeConversionTarget : public ConversionTarget { + public: + explicit BFloat16TypeConversionTarget(MLIRContext& ctx, + BFloat16TypeConverter& converter) + : ConversionTarget(ctx), converter_(converter) { + markUnknownOpDynamicallyLegal([this](Operation* op) { + // The FuncOp type can contain types that the op's operand and result + // types do not contain. 
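+      // A func.func op has no SSA operands or results of its own; its
+      // argument and result types live in the function type attribute, so
+      // legality has to be checked against the whole signature instead.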
+ if (auto func = dyn_cast(op)) { + if (!converter_.isSignatureLegal(func.getFunctionType())) return false; + } else if (auto bitcast_convert_op = + dyn_cast(op)) { + return IsConvertOpLegal(bitcast_convert_op, + converter_); + } else if (auto convert_op = dyn_cast(op)) { + return IsConvertOpLegal(convert_op, + converter_); + } + return converter_.isLegal(op); + }); + } + + private: + BFloat16TypeConverter& converter_; +}; + +class BFloat16TypePattern : public ConversionPattern { + public: + BFloat16TypePattern(TypeConverter& converter, MLIRContext* ctx) + : ConversionPattern(converter, MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {} + + LogicalResult matchAndRewrite( + Operation* op, const ArrayRef operands, + ConversionPatternRewriter& rewriter) const override { + if (getTypeConverter()->isLegal(op)) { + return failure(); + } + if (isa(op)) { + // Skip `BitcastConvertOp`, which is handled by the other pattern. + return failure(); + } + + // Update the results. + SmallVector new_results; + if (failed(getTypeConverter()->convertTypes(op->getResultTypes(), + new_results))) + return failure(); + + // Update the regions. The dialect conversion framework wants new regions to + // be created and updated, rather than updating the old op. Thus we use an + // OperationState so we can add regions to the new op. + OperationState state(op->getLoc(), op->getName().getStringRef(), operands, + new_results, op->getAttrs(), op->getSuccessors()); + for (Region& region : op->getRegions()) { + auto new_region = std::make_unique(op); + rewriter.inlineRegionBefore(region, *new_region, new_region->begin()); + if (failed(rewriter.convertRegionTypes(new_region.get(), + *getTypeConverter()))) { + return failure(); + } + state.addRegion(std::move(new_region)); + } + + // Convert value of ConstantOp to bfloat16. + if (auto const_op = dyn_cast(op)) { + const auto values = const_op.getValue().tryGetValues(); + if (!values.has_value()) { + return failure(); + } + const SmallVector bfloat16_values(values->begin(), + values->end()); + state.attributes.set( + const_op.getValueAttrName(), + DenseFPElementsAttr::get( + mlir::dyn_cast(const_op.getValue().getType()) + .clone(rewriter.getBF16Type()), + bfloat16_values)); + } + + rewriter.replaceOp(op, rewriter.create(state)->getResults()); + + return success(); + } +}; + +class BitcastConvertOpPattern + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + mlir::stablehlo::BitcastConvertOp op, + mlir::stablehlo::BitcastConvertOpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const bool is_input_legal = + getTypeConverter()->isLegal(op.getOperand().getType()); + const bool is_output_legal = + getTypeConverter()->isLegal(op.getResult().getType()); + if (is_input_legal && is_output_legal) { + return failure(); + } else if (is_input_legal) { + // output is f32, we bitcast_convert to f32 and then convert to bf16. + const Value output = rewriter.create( + op->getLoc(), op.getResult().getType(), adaptor.getOperand()); + rewriter.replaceOpWithNewOp( + op, getTypeConverter()->convertType(op.getResult().getType()), + output); + } else if (is_output_legal) { + // input is f32, we convert from bf16 and then bitcast_convert. + const Value output = rewriter.create( + op->getLoc(), op.getOperand().getType(), adaptor.getOperand()); + rewriter.replaceOpWithNewOp( + op, op.getResult().getType(), output); + } else { + // Both input/output are f32. Convert to no-op. 
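+      // After conversion both the operand and the result map to bfloat16, so
+      // the bitcast is an identity and the already-converted operand can be
+      // forwarded directly.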
+ rewriter.replaceOp(op, adaptor.getOperand()); + } + return success(); + } +}; +} // namespace + +#define GEN_PASS_DEF_CONVERTFUNCTOBFLOAT16PASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" +namespace { +class ConvertFuncToBfloat16Pass + : public impl::ConvertFuncToBfloat16PassBase { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertFuncToBfloat16Pass) + + explicit ConvertFuncToBfloat16Pass() = default; + + private: + void runOnOperation() override; +}; + +void ConvertFuncToBfloat16Pass::runOnOperation() { + func::FuncOp func_op = getOperation(); + MLIRContext* context = func_op.getContext(); + RewritePatternSet patterns(context); + + BFloat16TypeConverter converter; + patterns.add(converter, + context); + populateFunctionOpInterfaceTypeConversionPattern(patterns, + converter); + BFloat16TypeConversionTarget target(*context, converter); + if (failed(applyPartialConversion(func_op.getOperation(), target, + std::move(patterns)))) { + return signalPassFailure(); + } +} + +} // namespace +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_convert_shape_constraint_to_assert.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_convert_shape_constraint_to_assert.cc new file mode 100644 index 000000000000..bc9f247c7195 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_convert_shape_constraint_to_assert.cc @@ -0,0 +1,215 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeRange.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "stablehlo/transforms/Passes.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" // IWYU pragma: keep + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_CONVERTSHAPETOSTABLEHLOWITHCONSTRAINTSPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { +using ::mlir::stablehlo::AndOp; +using ::mlir::stablehlo::CompareOp; +using ::mlir::stablehlo::ComparisonDirection; +using ::mlir::stablehlo::ConcatenateOp; +using ::mlir::stablehlo::ConstantOp; +using ::mlir::stablehlo::CustomCallOp; +using ::mlir::stablehlo::OrOp; +using ::mlir::stablehlo::ReshapeOp; +using ::mlir::stablehlo::SliceOp; + +// Cast from index-based shape representation used in the Shape dialect to the +// i32-based representation used in HLO: +// * index => tensor. +// * tensor => tensor. +// * All i32-based types from above => themselves. +// There is no convenient op that can express this, so we're using +// unrealized_conversion_cast (with the idea that all these casts will +// annihilate at the end of the pass). +Value castToI32(PatternRewriter& rewriter, Location loc, Value value) { + Type resultType; + if (value.getType().isIndex()) + resultType = RankedTensorType::get({}, rewriter.getI32Type()); + if (auto valueType = mlir::dyn_cast(value.getType())) { + if (!valueType.hasStaticShape()) return {}; + if (valueType.getElementType().isInteger(32)) return value; + if (valueType.getElementType().isIndex()) + resultType = + RankedTensorType::get(valueType.getShape(), rewriter.getI32Type()); + } + if (!resultType) return {}; + auto cast = + rewriter.create(loc, resultType, value); + return cast.getResult(0); +} + +// Pads input tensor by X ones from the left. The number X is +// determined by input pad. Result is tensor<(X+N) x i32>, where the first X +// elements are ones. 
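+// For example, padding dense<[5, 7]> : tensor<2xi32> with pad = 2 yields
+// dense<[1, 1, 5, 7]> : tensor<4xi32>, mirroring how broadcasting aligns
+// shapes of different ranks from the right.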
+Value padFromLeft(PatternRewriter& rewriter, Location loc, Value input, + int64_t pad) { + Value padI32 = rewriter.create( + loc, DenseIntElementsAttr::get( + RankedTensorType::get({pad}, rewriter.getI32Type()), 1)); + return rewriter.create(loc, ValueRange{padI32, input}, + /*dimension=*/0); +} + +void insertShapeAssertionCustomCall(OpBuilder builder, Location loc, + Value assert) { + auto customCall = + builder.create(loc, TypeRange{}, ValueRange{assert}); + customCall.setCallTargetName("shape_assertion"); + customCall.setHasSideEffect(true); + customCall->setAttr("error_message", + builder.getStringAttr("Shape assertion failed")); +} + +struct ConvertCstrBroadcastableOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(shape::CstrBroadcastableOp op, + PatternRewriter& rewriter) const override { + // As defined, op inputs must be 1D tensor or !shape.shape. + // We only support inputs of two 1D tensors. + if (op.getShapes().size() != 2) return failure(); + auto shape1 = castToI32(rewriter, op.getLoc(), op.getShapes().front()); + auto shape2 = castToI32(rewriter, op.getLoc(), op.getShapes().back()); + if (!shape1 || !shape2) return failure(); + auto tensorType1 = mlir::dyn_cast(shape1.getType()); + auto tensorType2 = mlir::dyn_cast(shape2.getType()); + if (!tensorType1 || !tensorType2) return failure(); + + // If the two operand shapes are of different sizes, the smaller one is + // padded with 1's from the left. + int32_t rank = + std::max(tensorType1.getDimSize(0), tensorType2.getDimSize(0)); + if (tensorType1.getDimSize(0) < tensorType2.getDimSize(0)) { + shape1 = + padFromLeft(rewriter, op.getLoc(), shape1, + tensorType2.getDimSize(0) - tensorType1.getDimSize(0)); + } else if (tensorType1.getDimSize(0) > tensorType2.getDimSize(0)) { + shape2 = + padFromLeft(rewriter, op.getLoc(), shape2, + tensorType1.getDimSize(0) - tensorType2.getDimSize(0)); + } + + // Compute if each dim is broadcastable. A dim is broadcastable iff + // dimSize1 == dimSize2 or dimSize1 == 1 or dimSize2 == 1 + auto allOne = rewriter.create( + op.getLoc(), DenseIntElementsAttr::get( + RankedTensorType::get({rank}, rewriter.getI32Type()), + static_cast(1))); + Value dimSize1Is1 = rewriter.create(op.getLoc(), shape1, allOne, + ComparisonDirection::EQ); + Value dimSize2Is1 = rewriter.create(op.getLoc(), shape2, allOne, + ComparisonDirection::EQ); + Value eitherDimSizeIs1 = + rewriter.create(op.getLoc(), dimSize1Is1, dimSize2Is1); + Value dimSizeEq = rewriter.create(op.getLoc(), shape1, shape2, + ComparisonDirection::EQ); + Value dimBroadcastable = + rewriter.create(op.getLoc(), eitherDimSizeIs1, dimSizeEq); + + // Iterate over each dim to check that all dims are broadcastable. + auto boolType = RankedTensorType::get({1}, rewriter.getI1Type()); + Value allBroadcastable = rewriter.create( + op.getLoc(), DenseIntElementsAttr::get(boolType, true)); + for (auto i = 0; i < rank; ++i) { + Value broadcastable = rewriter.create( + op.getLoc(), dimBroadcastable, rewriter.getDenseI64ArrayAttr(i), + rewriter.getDenseI64ArrayAttr(i + 1), + rewriter.getDenseI64ArrayAttr(1)); + allBroadcastable = + rewriter.create(op.getLoc(), allBroadcastable, broadcastable); + } + Value allBroadcastableScalar = rewriter.create( + op.getLoc(), RankedTensorType::get({}, rewriter.getI1Type()), + allBroadcastable); + + // Add CustomCallOp and replace Cstr op with const witness, which is useful + // for canonicalizer to remove the shape.assuming region. 
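+    // The custom call is marked side-effecting so it is not dead-code
+    // eliminated despite having no results; its error_message attribute is
+    // the message reported when the assertion fails.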
+ insertShapeAssertionCustomCall(rewriter, op->getLoc(), + allBroadcastableScalar); + rewriter.replaceOpWithNewOp(op.getOperation(), true); + return success(); + } +}; + +bool hasIndexStyle(Value value) { + if (value.getType().isIndex()) return true; + auto type = mlir::dyn_cast(value.getType()); + return type && type.getElementType().isIndex(); +} + +struct ConvertShapeToStablehloWithConstraintsPass + : public impl::ConvertShapeToStablehloWithConstraintsPassBase< + ConvertShapeToStablehloWithConstraintsPass> { + void runOnOperation() override { + ConversionTarget target(getContext()); + target.addIllegalDialect(); + target.addIllegalDialect(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addDynamicallyLegalDialect<::mlir::stablehlo::StablehloDialect>( + [](Operation* op) { + return !llvm::any_of(op->getOperands(), hasIndexStyle); + }); + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + + RewritePatternSet patterns(&getContext()); + ::mlir::stablehlo::populateShapeToStablehloPatterns(&getContext(), + &patterns); + + patterns.add(&getContext()); + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns)))) + return signalPassFailure(); + } +}; + +} // namespace +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_convert_xla_call_module_op_to_bfloat16.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_convert_xla_call_module_op_to_bfloat16.cc new file mode 100644 index 000000000000..2db14f7470f0 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_convert_xla_call_module_op_to_bfloat16.cc @@ -0,0 +1,146 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "stablehlo/dialect/Serialization.h" // from @stablehlo +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/utils/bfloat16_type.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir::tf_quant::stablehlo { + +absl::StatusOr ConvertSerializedStableHloModuleToBfloat16( + const StringRef serialized_stablehlo_module) { + // StableHLO module is empty often because the XlaCallModuleOp is already + // deserialized, e.g. after invoking XlaCallModuleDeserializationPass. We + // don't handle this situation. + if (serialized_stablehlo_module.empty()) { + return absl::InvalidArgumentError("StableHLO module is empty."); + } + + MLIRContext context; + OwningOpRef stablehlo_module_op = + mlir::stablehlo::deserializePortableArtifact(serialized_stablehlo_module, + &context); + auto version = + mlir::stablehlo::getPortableArtifactVersion(serialized_stablehlo_module); + if (failed(version)) { + return absl::InternalError( + "Failed to get the deserialized StableHLO version, XlaCallModuleOp " + "must have a valid StableHLO module serialized using " + "stablehlo::serializePortableArtifact APIs."); + } + + // Convert the StableHLO module to bfloat16. 
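+  // bfloat16 keeps the 8-bit exponent of f32 but truncates the mantissa to 7
+  // bits, so converted values retain their dynamic range at reduced
+  // precision.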
+ PassManager pm(&context); + pm.addNestedPass(createConvertFuncToBfloat16Pass()); + if (failed(pm.run(stablehlo_module_op.get()))) { + return absl::InternalError( + "Failed to convert StableHLO module to bfloat16."); + } + + std::string bytecode; + llvm::raw_string_ostream os(bytecode); + if (failed(mlir::stablehlo::serializePortableArtifact( + stablehlo_module_op.get(), version.value().toString(), os))) { + return absl::InternalError("Failed to serialize StableHLO module."); + } + return bytecode; +} + +#define GEN_PASS_DEF_CONVERTXLACALLMODULEOPTOBFLOAT16PASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { +class ConvertXlaCallModuleOpToBfloat16Pass + : public impl::ConvertXlaCallModuleOpToBfloat16PassBase< + ConvertXlaCallModuleOpToBfloat16Pass> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + ConvertXlaCallModuleOpToBfloat16Pass) + + explicit ConvertXlaCallModuleOpToBfloat16Pass() = default; + + private: + void runOnOperation() override; +}; + +void ConvertXlaCallModuleOpToBfloat16Pass::runOnOperation() { + Operation* func_op = getOperation(); + SymbolTableCollection symbol_table; + OpBuilder builder(&getContext()); + + auto result = func_op->walk([&](TF::XlaCallModuleOp op) { + // Converts the serialized StableHLO module to bfloat16. + auto result = + ConvertSerializedStableHloModuleToBfloat16(op.getModuleAttr()); + if (!result.ok()) { + llvm::errs() << "Failed to convert StableHLO module to bfloat16: " + << result.status().message(); + return WalkResult::interrupt(); + } + op.setModuleAttr(StringAttr::get(&getContext(), *result)); + + // Convert the `tf.XlaCallModuleOp` to bfloat16 and add casts around it. + builder.setInsertionPoint(op); + for (auto& op_operand : op->getOpOperands()) { + if (quant::stablehlo::IsLargeFloatType(op_operand.get().getType())) { + op_operand.set(builder.create( + op->getLoc(), + quant::stablehlo::ToBfloat16Type(op_operand.get().getType()), + op_operand.get())); + } + } + builder.setInsertionPointAfter(op); + for (auto op_result : op->getOpResults()) { + if (quant::stablehlo::IsLargeFloatType(op_result.getType())) { + const Type original_type = op_result.getType(); + op_result.setType(quant::stablehlo::ToBfloat16Type(original_type)); + const Value cast = + builder.create(op->getLoc(), original_type, op_result); + op_result.replaceAllUsesExcept(cast, cast.getDefiningOp()); + } + } + return WalkResult::advance(); + }); + + if (result.wasInterrupted()) return signalPassFailure(); +} + +} // namespace +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_defer_activation_transpose.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_defer_activation_transpose.cc new file mode 100644 index 000000000000..f2816f4a700c --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_defer_activation_transpose.cc @@ -0,0 +1,294 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include + +#include "absl/base/nullability.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation.h" + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_DEFERACTIVATIONTRANSPOSEPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +using ::mlir::stablehlo::AddOp; +using ::mlir::stablehlo::BroadcastInDimOp; +using ::mlir::stablehlo::MaxOp; +using ::mlir::stablehlo::TransposeOp; + +// Returns `success()` if `op` is a `TransposeOp` with permutation attribute +// equivalent to `permuation`. +LogicalResult IsTransposeOpWithPermuation(Operation* absl_nullable op, + const ArrayRef permutation) { + auto transpose_op = dyn_cast_or_null(op); + return success(transpose_op != nullptr && transpose_op.getPermutation() == + ArrayRef(permutation)); +} + +// Convenience function to create a `TransposeOp` with a given `permutation`. +// The Location is set as `input`'s loc. +TransposeOp CreateTransposeOp(Value input, const ArrayRef permutation, + PatternRewriter& rewriter) { + return rewriter.create( + input.getLoc(), input, rewriter.getDenseI64ArrayAttr(permutation)); +} + +// Defers the transpose of the left-hand side (LHS) to the right-hand side and +// the result of a binary operation. In detail, this rewrites the +// `op(transpose(%rhs), %lhs)` to `transpose(op(%rhs, transpose(%lhs)))`. The +// LHS transpose permutation must be a NCHW->NHWC permutation. +template +void DeferRhsTransposeForBinaryOp(OpT op, PatternRewriter& rewriter) { + auto transpose_op = cast(op.getOperand(0).getDefiningOp()); + Value lhs_pre_transpose = transpose_op.getOperand(); + + // NCHW -> NHWC for the right-hand side, to match the operand's shape. + Value rhs = op.getOperand(1); + TransposeOp rhs_transpose_op = CreateTransposeOp( + /*input=*/rhs, kNchwToNhwcPermutation, rewriter); + + auto new_binary_op = + rewriter.create(op.getLoc(), lhs_pre_transpose, rhs_transpose_op); + + // NHWC -> NCHW for the output, to match the shapes of `op`'s users. + TransposeOp output_transpose_op = CreateTransposeOp( + /*input=*/new_binary_op, kNhwcToNchwPermutation, rewriter); + + rewriter.replaceAllUsesWith(op.getResult(), output_transpose_op); +} + +// "Climbs up" the `op` if `op` is a `BraodcastInDimOp` and returns the defining +// op of its operand. Returns `op` otherwise. May return `nullptr` when the +// `BroadcastInDimOp`'s operand is a block argument. 
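+// Schematically, for
+//   %bcast = stablehlo.broadcast_in_dim %cst ... : tensor<64xf32> -> tensor<1x64x28x28xf32>
+// passing %bcast's defining op returns the op that defines %cst.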
+Operation* absl_nullable SkipUpwardsOptionalBroadcastInDimOp( + Operation* absl_nonnull op) { + if (auto broadcast_in_dim_op = dyn_cast_or_null(op); + broadcast_in_dim_op != nullptr) { + return broadcast_in_dim_op.getOperand().getDefiningOp(); + } + return op; +} + +class DeferActivationTransposeForAddOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(AddOp op, + PatternRewriter& rewriter) const override { + // Only supports the case for 2D convolution. + const Value lhs = op.getOperand(0); + if (!HasRankOf(lhs, /*rank=*/4)) return failure(); + + const Value rhs = op.getOperand(1); + Operation* rhs_op = rhs.getDefiningOp(); + if (rhs_op == nullptr) return failure(); + + // Ignore the optional `BroadcastInDimOp` in between the constant and RHS. + rhs_op = SkipUpwardsOptionalBroadcastInDimOp(rhs_op); + + if (rhs_op == nullptr || !rhs_op->hasTrait()) { + return failure(); + } + + // Match LHS permutation that converts: NHWC -> NCHW. + if (IsTransposeOpWithPermuation(lhs.getDefiningOp(), kNhwcToNchwPermutation) + .failed()) { + return failure(); + } + + DeferRhsTransposeForBinaryOp(op, rewriter); + return success(); + } +}; + +// Rewrites the `reduce_window(transpose(%activation), %init_value)` patterns to +// `transpose(reduce_window(%activation), %init_value)`, deferring the transpose +// to the result. The reduce function should be equivalent to +// `stablehlo.maximum`, representing max pooling. +class DeferActivationTransposeForMaxPoolReduceWindowOp + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(mlir::stablehlo::ReduceWindowOp op, + PatternRewriter& rewriter) const override { + if (failed(MatchMaxPoolReduceWindowOp(op))) return failure(); + + // Match only when the lhs is connected to a transpose. + // Only supports the case commonly appearing for 2D convolutions. + Value lhs = op.getOperand(0); + if (!HasRankOf(lhs, /*rank=*/4)) return failure(); + + // Match input permutation that converts: NHWC -> NCHW. + if (IsTransposeOpWithPermuation(lhs.getDefiningOp(), kNhwcToNchwPermutation) + .failed()) { + return failure(); + } + + // Pushes the transpose op at the input to the result. + auto transpose_op = cast(op.getOperand(0).getDefiningOp()); + + const auto result_type = mlir::cast(op.getResult(0).getType()); + const SmallVector new_result_shape = + quant::Permute(result_type.getShape(), kNchwToNhwcPermutation); + + const TensorType new_result_type = + result_type.cloneWith(new_result_shape, result_type.getElementType()); + + // Create a new `stablehlo.reduce_window` with all relevant attributes + // permutated to match the new operand & result type. + auto new_reduce_window_op = + rewriter.create( + op.getLoc(), new_result_type, transpose_op.getOperand(), + /*init_value=*/op.getOperand(1), + /*window_dimensions=*/ + PermuteI64ArrayAttr(rewriter, op.getWindowDimensions(), + kNchwToNhwcPermutation), + /*window_strides=*/ + PermuteI64ArrayAttr(rewriter, op.getWindowStrides(), + kNchwToNhwcPermutation), + /*base_dilations=*/ + PermuteI64ArrayAttr(rewriter, op.getBaseDilations(), + kNchwToNhwcPermutation), + /*window_dilations=*/ + PermuteI64ArrayAttr(rewriter, op.getWindowDilations(), + kNchwToNhwcPermutation), + /*padding=*/DenseIntElementsAttr(nullptr)); + + // Clone the reduce body. It is not affected by the permutation. 
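+    // The body only operates on rank-0 (scalar) block arguments, so permuting
+    // the window and result layout does not change its semantics.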
+ IRMapping mapping; + op.getBody().cloneInto(&new_reduce_window_op.getBody(), mapping); + + // Introduce a transpose to the result to match the shapes of `op`'s uses. + TransposeOp result_transpose_op = CreateTransposeOp( + /*input=*/new_reduce_window_op.getResult(0), kNhwcToNchwPermutation, + rewriter); + + rewriter.replaceAllUsesWith(op.getResult(0), result_transpose_op); + return success(); + } + + private: + // Permutes `array_attr` with `permutation`. The number of elements in + // `array_attr` and `permutation` must be equal. Returns a null attribute + // if `array_attr` is null. + DenseI64ArrayAttr PermuteI64ArrayAttr( + PatternRewriter& rewriter, + const std::optional> array_attr, + const ArrayRef permutation) const { + if (!array_attr.has_value()) return DenseI64ArrayAttr(nullptr); + + return rewriter.getDenseI64ArrayAttr( + quant::Permute(array_attr.value(), permutation)); + } + + LogicalResult MatchMaxPoolReduceWindowOp( + mlir::stablehlo::ReduceWindowOp op) const { + // TODO: b/321099943 - Support explicit padding. + if (HasPadding(op)) return failure(); + + // Check that the reduce-window body is a max operation. + return success(IsMaxFunction(op.getBody().front())); + } + + // Whether `block` semantically corresponds to a `stablehlo.maximum` op. + bool IsMaxFunction(Block& block) const { + if (block.getNumArguments() != 2) return false; + + auto return_op = cast(block.getTerminator()); + if (return_op.getNumOperands() != 1) return false; + + auto max_op = dyn_cast_or_null( + return_op.getOperands().front().getDefiningOp()); + if (!max_op) return false; + + return (max_op.getLhs() == block.getArgument(0)) && + (max_op.getRhs() == block.getArgument(1)); + } + + // Whether `op` has the `padding` attribute (which is optional). + bool HasPadding(mlir::stablehlo::ReduceWindowOp op) const { + return op.getPadding() != std::nullopt; + } +}; + +// Rewrites `maximum(transpose(%rhs), %lhs)` patterns to +// `transpose(maximum(%rhs, transpose(%lhs)))`. 
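+// A common instance is a ReLU written as `stablehlo.maximum(%activation,
+// %zero_constant)`, where the constant operand is matched by the trait check
+// below.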
+class DeferActivationTransposeForMaxOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(MaxOp op, + PatternRewriter& rewriter) const override { + Value input = op.getOperand(0); + if (!HasRankOf(input, /*rank=*/4)) return failure(); + + const Value max_value = op.getOperand(1); + Operation* max_value_op = max_value.getDefiningOp(); + if (max_value_op == nullptr || + !max_value_op->hasTrait()) { + return failure(); + } + + if (IsTransposeOpWithPermuation(input.getDefiningOp(), + kNhwcToNchwPermutation) + .failed()) { + return failure(); + } + DeferRhsTransposeForBinaryOp(op, rewriter); + return success(); + } +}; + +} // namespace + +class DeferActivationTransposePass + : public impl::DeferActivationTransposePassBase< + DeferActivationTransposePass> { + private: + void runOnOperation() override; +}; + +void DeferActivationTransposePass::runOnOperation() { + func::FuncOp func_op = getOperation(); + MLIRContext& ctx = getContext(); + + RewritePatternSet patterns(&ctx); + patterns.add(&ctx); + if (failed(applyPatternsGreedily(func_op, std::move(patterns)))) { + func_op->emitWarning() << "Failed to converge patterns: " << getArgument(); + } +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_fold_constant_transpose.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_fold_constant_transpose.cc new file mode 100644 index 000000000000..4de2b0ee026b --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_fold_constant_transpose.cc @@ -0,0 +1,195 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation.h" + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_FOLDCONSTANTTRANSPOSEPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +// Returns contiguous offset (address) of the position represented by `indices` +// in a `shape` shaped tensor. Assumes row-major order. 
`indices` and `shape` +// should have the same size. +// Example: Index (2, 3) of a (4, 5)-shaped tensor has the contiguous offset of +// 2 * 5 + 3 = 13. +int64_t GetContiguousOffset(const ArrayRef indices, + const ArrayRef shape) { + int64_t contiguous_offset = 0; + int64_t base_offset = 1; + for (auto [i, dimension] : llvm::reverse(llvm::zip_equal(indices, shape))) { + contiguous_offset += base_offset * i; + base_offset *= dimension; + } + + return contiguous_offset; +} + +// Performs transposition of a tensor represented as a contiguous element array. +// Assumes row-major order. The shape of the input tensor and the desired +// permutation is registered during construction, and calling `TransposeValues` +// returns the transposed tensor values. +class DenseElementsTransposer { + public: + DenseElementsTransposer(const ArrayRef original_shape, + const ArrayRef permutation) + : rank_(original_shape.size()), + original_shape_(original_shape), + target_shape_(quant::Permute(original_shape, permutation)), + permutation_(permutation) {} + + // Transposes `values` with the permutation. Returns the transposed values. + SmallVector TransposeValues(const ArrayRef values) const { + SmallVector transposed_values(values.size()); + SmallVector current_indices = {}; + TransposeRecursively(values, transposed_values, current_indices); + + return transposed_values; + } + + // Returns the shape after permutation. + SmallVector GetTargetShape() const { return target_shape_; } + + private: + // Helper function that performs transposition recursively by mapping each set + // of indices from the original values to the target values. + void TransposeRecursively(const ArrayRef original_values, + const MutableArrayRef target_values, + SmallVector& current_indices) const { + // Map an element from `original_values` to `target_values` when a set of + // indices is formed. + if (current_indices.size() == rank_) { + const int64_t original_index = + GetContiguousOffset(current_indices, original_shape_); + + const SmallVector target_indices = + quant::Permute(current_indices, permutation_); + const int64_t target_index = + GetContiguousOffset(target_indices, target_shape_); + + target_values[target_index] = original_values[original_index]; + return; + } + + // Recursively iterate by selecting the index of the next dimension. + const int next_shape_idx = current_indices.size(); + for (int i = 0; i < original_shape_[next_shape_idx]; ++i) { + current_indices.push_back(i); + TransposeRecursively(original_values, target_values, current_indices); + current_indices.pop_back(); + } + } + + int rank_; // Rank of the input values. + SmallVector original_shape_; // Shape of the original tensor. + SmallVector target_shape_; // Shape of the target tensor. + SmallVector permutation_; +}; + +class FoldTransposedConstantOp + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(mlir::stablehlo::TransposeOp op, + PatternRewriter& rewriter) const override { + Value operand = op.getOperand(); + auto const_op = + dyn_cast_or_null(operand.getDefiningOp()); + if (!const_op) return failure(); + + // Only support float tensors. 
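[Editorial sketch, not part of the patch] A self-contained sketch of the row-major offset computation and element remapping that `DenseElementsTransposer` performs, checked on a 2x3 example (helper names are hypothetical):

#include <cassert>
#include <cstdint>
#include <vector>

int64_t RowMajorOffset(const std::vector<int64_t>& indices,
                       const std::vector<int64_t>& shape) {
  int64_t offset = 0, stride = 1;
  for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
    offset += stride * indices[i];
    stride *= shape[i];
  }
  return offset;
}

int main() {
  // Index (2, 3) of a (4, 5)-shaped tensor -> 2 * 5 + 3 = 13.
  assert(RowMajorOffset({2, 3}, {4, 5}) == 13);

  // Transposing a 2x3 tensor with permutation {1, 0} maps element (i, j)
  // of the original to element (j, i) of the 3x2 result.
  const std::vector<float> values = {0, 1, 2, 3, 4, 5};  // shape {2, 3}
  std::vector<float> transposed(values.size());          // shape {3, 2}
  for (int64_t i = 0; i < 2; ++i) {
    for (int64_t j = 0; j < 3; ++j) {
      transposed[RowMajorOffset({j, i}, {3, 2})] =
          values[RowMajorOffset({i, j}, {2, 3})];
    }
  }
  assert((transposed == std::vector<float>{0, 3, 1, 4, 2, 5}));
  return 0;
}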
+ auto tensor_type = mlir::dyn_cast_or_null(const_op.getType()); + if (!tensor_type || !tensor_type.getElementType().isF32()) { + return failure(); + } + + if (!mlir::isa_and_nonnull(const_op.getValue())) { + return failure(); + } + + const auto value_attr = + mlir::cast(const_op.getValue()); + const ArrayRef original_shape = + value_attr.getShapedType().getShape(); + + const SmallVector original_values = + llvm::to_vector(value_attr.getValues()); + + // Fold the constant value by transposing the values according to the + // `TransposeOp`'s permutation attribute. + const DenseElementsTransposer transposer(original_shape, + op.getPermutation()); + SmallVector transposed_values = + transposer.TransposeValues(original_values); + + // Create a new constant op with the transposed values. + const Location combined_loc = + rewriter.getFusedLoc({const_op.getLoc(), op.getLoc()}); + auto new_value_type = + RankedTensorType::getChecked(combined_loc, transposer.GetTargetShape(), + /*elementType=*/rewriter.getF32Type()); + auto new_value_attr = + DenseFPElementsAttr::get(new_value_type, std::move(transposed_values)); + auto new_const_op = rewriter.create( + combined_loc, new_value_attr); + + rewriter.replaceAllUsesWith(op, new_const_op); + return success(); + } +}; + +} // namespace + +class FoldConstantTransposePass + : public impl::FoldConstantTransposePassBase { + public: + using impl::FoldConstantTransposePassBase< + FoldConstantTransposePass>::FoldConstantTransposePassBase; + + private: + void runOnOperation() override; +}; + +void FoldConstantTransposePass::runOnOperation() { + func::FuncOp func_op = getOperation(); + MLIRContext& ctx = getContext(); + + RewritePatternSet patterns(&ctx); + patterns.add(&ctx); + if (failed(applyPatternsGreedily(func_op, std::move(patterns)))) { + func_op.emitError("Failed to fold constant->transpose pattern."); + signalPassFailure(); + } +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_insert_calibration_statistics_saver.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_insert_calibration_statistics_saver.cc new file mode 100644 index 000000000000..1f4fa95533c3 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_insert_calibration_statistics_saver.cc @@ -0,0 +1,190 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" // IWYU pragma: keep +#include "tsl/platform/path.h" + +namespace mlir::tf_quant::stablehlo { +namespace { + +std::string GetOutputFilePath(absl::string_view calibration_data_dir, + absl::string_view func_name, + int32_t output_file_idx) { + return tsl::io::JoinPath(calibration_data_dir, + llvm::Twine(func_name) + .concat("_") + .concat(std::to_string(output_file_idx)) + .concat(".pb") + .str()); +} + +// Finds `CustomAggregator` ops and collects their outputs and attributes. +void FindCustomAggregatorOps( + Region& region, + const std::unordered_set& aggregator_ops_to_ignore, + SmallVector& statistics_outputs, SmallVector& ids, + SmallVector& calibration_methods) { + for (auto op : region.getOps()) { + if (aggregator_ops_to_ignore.count(op.getId().str())) continue; + + ids.push_back(op.getId()); + calibration_methods.push_back(op.getCalibrationMethod()); + statistics_outputs.push_back(op.getMin()); + statistics_outputs.push_back(op.getMax()); + statistics_outputs.push_back(op.getHistogram()); + } +} + +// Inserts a `CalibrationStatisticsSaverOp` to the end of the region. +LogicalResult InsertCalibrationStatisticsSaverOp( + Region& region, MLIRContext& ctx, absl::string_view output_file_path, + const std::unordered_set& aggregator_ops_to_ignore) { + SmallVector statistics_outputs; + SmallVector ids; + SmallVector calibration_methods; + FindCustomAggregatorOps(region, aggregator_ops_to_ignore, statistics_outputs, + ids, calibration_methods); + if (statistics_outputs.empty()) return failure(); + + OpBuilder builder(&ctx); + // Set the insertion point right before the return op. + builder.setInsertionPoint(®ion.back().back()); + + StringAttr output_file_path_attr = builder.getStringAttr(output_file_path); + ArrayAttr ids_attr = builder.getStrArrayAttr(ids); + ArrayAttr calibration_methods_attr = + builder.getI32ArrayAttr(calibration_methods); + builder.create( + region.getLoc(), statistics_outputs, output_file_path_attr, ids_attr, + calibration_methods_attr); + return success(); +} + +// Returns true if the op contains a `CalibrationStatisticsSaverOp`. +bool ContainCalibrationStatisticsSaverOp(Operation* op) { + // Check the region for CaseRegionOp, IfRegionOp and WhileRegionOp. + for (Region& region : op->getRegions()) { + if (!region.getOps().empty()) { + return true; + } + } + + SymbolTable symbol_table(op->getParentOfType()); + // Check the functions associated to CaseOp, IfOp and WhileOp. 
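[Editorial sketch, not part of the patch] The statistics files written by this pass follow the `<calibration_data_dir>/<func_name>_<index>.pb` naming produced by `GetOutputFilePath`. A minimal sketch of that convention, using plain string concatenation instead of `tsl::io::JoinPath`:

#include <cassert>
#include <string>

std::string OutputFilePathSketch(const std::string& calibration_data_dir,
                                 const std::string& func_name, int idx) {
  return calibration_data_dir + "/" + func_name + "_" + std::to_string(idx) +
         ".pb";
}

int main() {
  assert(OutputFilePathSketch("/tmp/calibration_data", "main", 0) ==
         "/tmp/calibration_data/main_0.pb");
  return 0;
}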
+ for (const NamedAttribute& attr : op->getAttrs()) { + FlatSymbolRefAttr symbol_attr = + dyn_cast_or_null(attr.getValue()); + if (!symbol_attr) continue; + + func::FuncOp target_func = dyn_cast_or_null( + symbol_table.lookup(symbol_attr.getValue())); + if (!target_func) continue; + + if (!target_func.getBody() + .getOps() + .empty()) { + return true; + } + } + return false; +} + +} // namespace + +#define GEN_PASS_DECL_INSERTCALIBRATIONSTATISTICSSAVERPASS +#define GEN_PASS_DEF_INSERTCALIBRATIONSTATISTICSSAVERPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +class InsertCalibrationStatisticsSaverPass + : public impl::InsertCalibrationStatisticsSaverPassBase< + InsertCalibrationStatisticsSaverPass> { + public: + using impl::InsertCalibrationStatisticsSaverPassBase< + InsertCalibrationStatisticsSaverPass>:: + InsertCalibrationStatisticsSaverPassBase; + + private: + void runOnOperation() override; +}; + +void InsertCalibrationStatisticsSaverPass::runOnOperation() { + ModuleOp module_op = getOperation(); + MLIRContext& ctx = getContext(); + + std::unordered_set aggregator_ops_to_ignore( + aggregator_ops_to_ignore_.begin(), aggregator_ops_to_ignore_.end()); + + // Insert CalibrationStatisticsSaverOp to the end of each region. + for (auto func_op : module_op.getOps()) { + int32_t output_file_idx = 0; + StringRef func_name = func_op.getSymName(); + + func_op.walk([&output_file_idx, &ctx, &func_name, &aggregator_ops_to_ignore, + this](Operation* op) { + for (Region& region : op->getRegions()) { + if (succeeded(InsertCalibrationStatisticsSaverOp( + region, ctx, + GetOutputFilePath(calibration_data_dir_, func_name, + output_file_idx), + aggregator_ops_to_ignore))) { + ++output_file_idx; + }; + } + }); + } + + // Control flow ops that contains CalibrationStatisticsSaver ops must be set + // to stateful, otherwise the op will not be executed. + OpBuilder builder(&ctx); + module_op.walk([&builder](Operation* op) { + if (op->hasAttrOfType("is_stateless") && + ContainCalibrationStatisticsSaverOp(op)) { + op->setAttr("is_stateless", builder.getBoolAttr(false)); + } + }); +} + +std::unique_ptr> +CreateInsertCalibrationStatisticsSaverPass( + StringRef calibration_data_dir, + const std::vector& aggregator_ops_to_ignore) { + InsertCalibrationStatisticsSaverPassOptions options = { + .aggregator_ops_to_ignore_ = llvm::to_vector(aggregator_ops_to_ignore), + .calibration_data_dir_ = calibration_data_dir.str(), + }; + return std::make_unique(options); +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_insert_weight_param.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_insert_weight_param.cc new file mode 100644 index 000000000000..d6d4a9093051 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_insert_weight_param.cc @@ -0,0 +1,249 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_INSERTWEIGHTPARAMPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +using ::stablehlo::quantization::Method; +using ::stablehlo::quantization::QuantizedType; +using ::stablehlo::quantization::WeightOnlyPtq; + +// Inserts quantization parameters of weights for weight-only quantization and +// dynamic range quantization of `stablehlo.convolution` and +// `stablehlo.dot_general`. +class InsertWeightParamPass + : public impl::InsertWeightParamPassBase { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InsertWeightParamPass) + + using impl::InsertWeightParamPassBase< + InsertWeightParamPass>::InsertWeightParamPassBase; + + private: + void runOnOperation() override; +}; + +// Inserts quantization parameters for weights for hybrid quantization of +// `stablehlo.convolution` and `stablehlo.dot_general`. 
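[Editorial sketch, not part of the patch] For the per-tensor case, `GetUniformQuantizedTypeForWeight` is assumed to derive symmetric, narrow-range int8 parameters, i.e. zero point 0 and scale `max(|w|) / 127`. A back-of-the-envelope sketch of that computation (illustrative, not the library routine):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

struct QuantParams {
  double scale;
  int zero_point;
};

QuantParams SymmetricInt8ParamsSketch(const std::vector<float>& weights) {
  float max_abs = 0.0f;
  for (float w : weights) max_abs = std::max(max_abs, std::fabs(w));
  // Narrow-range symmetric int8: representable range [-127, 127], zp = 0.
  return {max_abs / 127.0, /*zero_point=*/0};
}

int main() {
  const QuantParams p = SymmetricInt8ParamsSketch({-0.5f, 0.25f, 0.1f});
  assert(std::fabs(p.scale - 0.5 / 127.0) < 1e-9);
  assert(p.zero_point == 0);
  return 0;
}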
+class InsertWeightParamPattern + : public OpTraitRewritePattern { + public: + explicit InsertWeightParamPattern(MLIRContext* context) + : OpTraitRewritePattern(context) {} + + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { + if (op->getNumResults() != 1) { + return failure(); + } + auto type = mlir::cast(op->getResult(0).getType()); + if (!type || !type.getElementType().isF32()) { + return failure(); + } + if (!op->hasOneUse() || + !IsWeightQuantizableFunction(*op->getUses().begin(), type.getRank())) { + return failure(); + } + + Operation* quantizable_op = *op->getUsers().begin(); + DenseFPElementsAttr attr; + matchPattern(op->getResult(0), m_Constant(&attr)); + + Method method = GetQuantizationMethodOrDefault(quantizable_op); + const WeightOnlyPtq& weight_only_ptq = method.weight_only_ptq(); + + Type weight_type; + if (IsPerTensor(weight_only_ptq)) { + weight_type = + dyn_cast(GetUniformQuantizedTypeForWeight( + attr, /*symmetric=*/true, /*num_bits=*/8, /*is_signed=*/true, + /*narrow_range=*/true, /*legacy_float_scale=*/false)); + } else { + int quantization_dimension = GetQuantizationDimension( + weight_only_ptq, cast(quantizable_op)); + weight_type = GetUniformQuantizedPerAxisTypeForWeight( + attr, quantization_dimension, /*symmetric=*/true, /*num_bits=*/8, + /*is_signed=*/true, + /*narrow_range=*/true, /*legacy_float_scale=*/false); + } + + auto quant_type = dyn_cast(weight_type); + if (!quant_type) { + op->emitError( + "Failed to get weight quantization parameters for weight-only " + "quantization."); + return failure(); + } + + const Type expressed_type = op->getResult(0).getType(); + const Type quantized_type = + quant_type.castFromExpressedType(expressed_type); + + rewriter.setInsertionPointAfter(op); + auto q = rewriter.create( + op->getLoc(), quantized_type, op->getResult(0)); + auto dq = rewriter.create( + op->getLoc(), expressed_type, q); + quantizable_op->setOperand(1, dq.getResult()); + return success(); + } + + // Checks if the operand is second operand of `tf.XlaCallModule` op for + // `stablehlo.convolution` or `stablehlo.dot_general` with fully_quantizable + // trait. + static bool IsWeightQuantizableFunction(OpOperand& operand, int64_t rank) { + if (operand.getOperandNumber() != 1) { + return false; + } + Operation* user = operand.getOwner(); + if (!IsWeightOnlyQuantizableOp(*user)) { + return false; + } + Method method = GetQuantizationMethodOrDefault(user); + return HasValidWeightOnlyPtqMethod(method.weight_only_ptq(), rank); + } + + private: + static bool HasValidWeightOnlyPtqMethod(const WeightOnlyPtq& weight_only_ptq, + int64_t rank) { + const auto& input_quantized_types = weight_only_ptq.input_quantized_types(); + if (IsPerTensor(weight_only_ptq)) { + return true; + } + // `input_quantized_types` should contain spec for quantization type of the + // second operand, which is weight. 
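[Editorial sketch, not part of the patch] When no explicit dimension spec is given, the default per-channel dimension for `dot_general` is chosen further below as the first rhs dimension that is neither contracting nor batching, falling back to the last dimension. A standalone sketch of that selection rule (hypothetical helper):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int64_t DefaultDotGeneralQuantDimSketch(
    int64_t rhs_rank, const std::vector<int64_t>& rhs_contracting_dims,
    const std::vector<int64_t>& rhs_batching_dims) {
  for (int64_t i = 0; i < rhs_rank; ++i) {
    const bool contracting =
        std::find(rhs_contracting_dims.begin(), rhs_contracting_dims.end(),
                  i) != rhs_contracting_dims.end();
    const bool batching =
        std::find(rhs_batching_dims.begin(), rhs_batching_dims.end(), i) !=
        rhs_batching_dims.end();
    if (!contracting && !batching) return i;
  }
  return rhs_rank - 1;
}

int main() {
  // Matmul-like dot_general rhs of shape [k, n], contracting {0} -> dim 1.
  assert(DefaultDotGeneralQuantDimSketch(2, {0}, {}) == 1);
  // Batched matmul rhs [b, k, n], batching {0}, contracting {1} -> dim 2.
  assert(DefaultDotGeneralQuantDimSketch(3, {1}, {0}) == 2);
  return 0;
}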
+ const QuantizedType& quantized_type = input_quantized_types.at(1); + if (const auto& specs = quantized_type.dimension_specs(); + specs.has_dimension()) { + return specs.dimension() >= 0 && specs.dimension() < rank; + } + return true; + } + + static bool IsPerTensor(const WeightOnlyPtq& weight_only_ptq) { + const auto& input_quantized_types = weight_only_ptq.input_quantized_types(); + if (input_quantized_types.empty()) { + return true; + } + auto weight_type = input_quantized_types.find(1); + if (weight_type == input_quantized_types.end()) { + return true; + } + return weight_type->second.has_per_tensor(); + } + + static int GetQuantizationDimension(const WeightOnlyPtq& weight_only_ptq, + TF::XlaCallModuleOp op) { + const QuantizedType& quantized_type = + weight_only_ptq.input_quantized_types().at(1); + if (quantized_type.dimension_specs().has_dimension()) { + return quantized_type.dimension_specs().dimension(); + } + return GetDefaultQuantizationDimension(op); + } + + // Determines quantization dimension of weights for given `tf.XlaCallModule` + // op. For convolution, returns output feature dimension of the kernel. For + // dot_general, returns the first non-contracting dimension, non-batching + // dimension. If such dimension does not exists, returns the last dimension of + // rhs. + static int64_t GetDefaultQuantizationDimension(TF::XlaCallModuleOp op) { + const StringRef function_name = GetEntryFunctionName(op); + const auto module_op = op->getParentOfType(); + const SymbolTable symbol_table(module_op); + func::FuncOp func = symbol_table.lookup(function_name); + + if (function_name.contains("conv")) { + return (*(func.getOps().begin())) + .getDimensionNumbers() + .getKernelOutputFeatureDimension(); + } else if (function_name.contains("dot_general")) { + auto dot = *(func.getOps().begin()); + const ::mlir::stablehlo::DotDimensionNumbersAttr dimension_numbers = + dot.getDotDimensionNumbers(); + ArrayRef rhs_contracting_dims = + dimension_numbers.getRhsContractingDimensions(); + ArrayRef rhs_batching_dims = + dimension_numbers.getRhsBatchingDimensions(); + int64_t rank = cast(dot.getRhs().getType()).getRank(); + for (int i = 0; i < rank; ++i) { + // Return the first non-contracting, non-batching dimension of rhs. + if (llvm::find(rhs_contracting_dims, i) == rhs_contracting_dims.end() && + llvm::find(rhs_batching_dims, i) == rhs_batching_dims.end()) { + return i; + } + } + } + return cast(op.getOperand(1).getType()).getRank() - 1; + } +}; + +void InsertWeightParamPass::runOnOperation() { + func::FuncOp func = getOperation(); + MLIRContext* context = func.getContext(); + RewritePatternSet patterns(context); + + patterns.add(context); + + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + signalPassFailure(); + } +} + +} // namespace + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_lift_quantizable_spots_as_functions.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_lift_quantizable_spots_as_functions.cc new file mode 100644 index 000000000000..bdd9255d9099 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_lift_quantizable_spots_as_functions.cc @@ -0,0 +1,243 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Rewrite/FrozenRewritePatternSet.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep +#include "tsl/platform/regexp.h" // IWYU pragma: keep + +#define DEBUG_TYPE "lift_quantizable_spots_as_functions" + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_LIFTQUANTIZABLESPOTSASFUNCTIONSPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +using ::stablehlo::quantization::FunctionNameMatcherSpec; +using ::stablehlo::quantization::Method; +using ::stablehlo::quantization::QuantizationSpec; +using ::stablehlo::quantization::QuantizationSpecs; +using ::tsl::protobuf::TextFormat; + +// TODO - b/303543789: Move the helper functions below to a separate util. +// Fetches the default or null attribute, used for pattern matching. +Attribute DefaultOrNullAttr(OpBuilder& builder, const Attribute& attr) { + if (attr) return attr; + return builder.getStringAttr(kNullAttributeValue); +} + +// Checks whether the value of a constant equals the given float, regardless +// of the tensor dimension. +bool FloatValueEquals(const Attribute& attr, const double value) { + const auto fp_attr = mlir::dyn_cast_or_null(attr); + if (!fp_attr) return false; + + if (fp_attr.isSplat()) { + return fp_attr.getSplatValue().isExactlyValue(value); + } + return llvm::all_of(fp_attr.getValues(), [value](const APFloat& f) { + return f.isExactlyValue(value); + }); +} + +inline void TrimTrailingWhitespaces(std::string& str) { + while (!str.empty() && str.back() == ' ') { + str.pop_back(); + } +} + +// Lifts quantizable units as separate functions, thereby identifying the +// boundaries of quantizable subgraphs. `QuantizationSpecs` influences how +// quantizable units are lifted. 
+// +// FileCheck test cases using various `QuantizationSpecs` can be seen at +// `TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass`. +class LiftQuantizableSpotsAsFunctionsPass + : public impl::LiftQuantizableSpotsAsFunctionsPassBase< + LiftQuantizableSpotsAsFunctionsPass> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + LiftQuantizableSpotsAsFunctionsPass) + + LiftQuantizableSpotsAsFunctionsPass() = default; + + // Constructor with explicit user-provided `QuantizationSpecs`. + explicit LiftQuantizableSpotsAsFunctionsPass( + QuantizationSpecs quantization_specs) + : quantization_specs_(std::move(quantization_specs)) {} + + private: + void runOnOperation() override; + + // No explicit quantization spec is specified by default. Implicitly this + // means that all quantizable units will be identified and lifted. + QuantizationSpecs quantization_specs_{}; +}; + +namespace simple_patterns { +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_simple.inc" +} + +namespace fusion_patterns { +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.inc" +} + +// Quantizable Unit matcher that uses lifted function's name for matching. +class FunctionNameMatcher { + public: + explicit FunctionNameMatcher(const FunctionNameMatcherSpec& spec) + : match_regex_(GetMatchRegex(spec)) {} + + // Returns `true` when matched with the entry function of + // `xla_call_module_op`. + bool Match(TF::XlaCallModuleOp xla_call_module_op) const { + if (match_regex_ == nullptr) return false; + + const std::string lifted_func_name = + xla_call_module_op->getAttrOfType("_entry_function") + .getValue() + .str(); + + return RE2::FullMatch(lifted_func_name, *match_regex_); // NOLINT + } + + private: + // Returns an owned `RE2` object that corresponds to the `spec`. Returns + // `nullptr` if the `spec` is invalid. + // NOLINTNEXTLINE - RE2 included via TSL regexp.h + std::unique_ptr GetMatchRegex(const FunctionNameMatcherSpec& spec) { + const std::string& regex = spec.regex(); + if (regex.empty()) return nullptr; + + return std::make_unique(regex); // NOLINT + } + + // Regex object used for matching against a lifted function's name. + std::unique_ptr match_regex_; // NOLINT +}; + +// Converts `Method` to a single-line textproto representation. Returns +// `failure()` when converting to textproto failed. +FailureOr QuantizationMethodToTextProto(const Method& method) { + TextFormat::Printer printer; + printer.SetSingleLineMode(true); + + std::string method_txtpb; + if (!printer.PrintToString(method, &method_txtpb)) { + LLVM_DEBUG(llvm::dbgs() << "Failed to convert Method to textproto\n."); + return failure(); + } + + // Single line mode might have an extra space at the end, due to the internal + // details of `Printer`. + TrimTrailingWhitespaces(method_txtpb); + + return method_txtpb; +} + +// Applies quantization spec to all matched lifted functions. At this point only +// denylisting (`NoQuantization`) will be applied if specs is nonempty. +// TODO: b/307620778 - Support more advanced selective quantization methods. 
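[Editorial sketch, not part of the patch] Matching is performed on the lifted function's `_entry_function` name against the user-supplied regex. An illustrative sketch of that check, using `std::regex` instead of RE2 and a hypothetical lifted-function name:

#include <cassert>
#include <regex>
#include <string>

bool MatchesLiftedFunctionName(const std::string& lifted_func_name,
                               const std::string& regex) {
  if (regex.empty()) return false;  // An empty spec matches nothing.
  return std::regex_match(lifted_func_name, std::regex(regex));
}

int main() {
  // E.g. deny-list every lifted dot_general fusion via a NoQuantization spec.
  assert(MatchesLiftedFunctionName("composite_dot_general_fn_1",
                                   "composite_dot_general.*"));
  assert(!MatchesLiftedFunctionName("composite_conv_fn_1",
                                    "composite_dot_general.*"));
  return 0;
}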
+LogicalResult ApplyQuantizationSpec(const QuantizationSpec& spec, + ModuleOp module_op) { + const Method& quantization_method = spec.method(); + + FailureOr quantization_method_txtpb = + QuantizationMethodToTextProto(quantization_method); + if (failed(quantization_method_txtpb)) return failure(); + + const FunctionNameMatcher matcher(spec.matcher().function_name()); + // Iterate over all XlaCallModuleOp in all FuncOps. + for (auto func : module_op.getOps()) { + for (auto xla_call_module_op : func.getOps()) { + if (!matcher.Match(xla_call_module_op)) continue; + + // Set the text representation of `Method` to matched + // `TF::XlaCallModuleOp`. + xla_call_module_op->setAttr( + kQuantizationMethodAttr, + StringAttr::get(module_op.getContext(), + std::move(*quantization_method_txtpb))); + } + } + return success(); +} + +void LiftQuantizableSpotsAsFunctionsPass::runOnOperation() { + MLIRContext* ctx = &getContext(); + RewritePatternSet patterns(ctx); + ModuleOp module_op = getOperation(); + + simple_patterns::populateWithGenerated(patterns); + fusion_patterns::populateWithGenerated(patterns); + FrozenRewritePatternSet frozen_patterns(std::move(patterns)); + + // Iterate over the sorted list of functions to keep order deterministic. + for (func::FuncOp func : GetSortedFunctions(module_op)) { + if (failed(applyPatternsGreedily(func, frozen_patterns))) { + func.emitError() + << "quant-stablehlo-lift-quantizable-spots-as-functions failed."; + signalPassFailure(); + } + } + + // Remove all attr_map attributes. + module_op.walk([](Operation* op) { op->removeAttr(kAttrMapAttribute); }); + + // Perform selective quantization. Iterates over the quantization specs and + // applies quantization methods to each matched lifted function. + for (const QuantizationSpec& spec : quantization_specs_.specs()) { + if (failed(ApplyQuantizationSpec(spec, module_op))) { + signalPassFailure(); + return; + } + } +} + +} // namespace + +// Creates `LiftQuantizableSpotsAsFunctionsPass` with user-defined +// `QuantizationSpecs`. +std::unique_ptr> +CreateLiftQuantizableSpotsAsFunctionsPass( + const QuantizationSpecs& quantization_specs) { + return std::make_unique( + quantization_specs); +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_merge_fusion_with_dequantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_merge_fusion_with_dequantize.cc new file mode 100644 index 000000000000..f9dfd1319656 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_merge_fusion_with_dequantize.cc @@ -0,0 +1,150 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/ChloOps.h" // from @stablehlo +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_MERGEFUSIONWITHDEQUANTIZEPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +class MergeFusionWithDequantizePass + : public impl::MergeFusionWithDequantizePassBase< + MergeFusionWithDequantizePass> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MergeFusionWithDequantizePass) + + explicit MergeFusionWithDequantizePass() = default; + + private: + void runOnOperation() override; +}; + +class MergeFusionWithUniformDequantizePattern + : public OpRewritePattern { + public: + explicit MergeFusionWithUniformDequantizePattern(MLIRContext* context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(func::CallOp call_op, + PatternRewriter& rewriter) const override { + if (call_op.getNumResults() != 1) return failure(); + auto users = call_op->getUsers(); + for (auto user : users) { + if (!llvm::isa(user)) { + return failure(); + } + } + auto func_name = call_op.getCallee(); + if (!func_name.starts_with("quantized_")) return failure(); + if (call_op->getNumResults() != 1) return failure(); + if (!mlir::isa( + getElementTypeOrSelf(call_op->getResult(0).getType()))) + return failure(); + + // Fetch the callee function. + SymbolTable symbol_table(call_op->getParentOfType()); + auto func_op = + dyn_cast_or_null(symbol_table.lookup(func_name)); + if (!func_op) return failure(); + // The quantized fusion should have requantize and return ops at the end. + auto return_op = dyn_cast_or_null( + func_op.getRegion().getBlocks().front().getTerminator()); + if (!return_op) return failure(); + auto req_op = llvm::dyn_cast_or_null( + return_op.getOperands()[0].getDefiningOp()); + if (!req_op) return failure(); + + // Create a new func.call op with f32 output. + auto new_call_op = call_op.clone(); + new_call_op->getResult(0).setType( + mlir::cast(call_op.getResult(0).getType()) + .clone(rewriter.getF32Type())); + rewriter.setInsertionPoint(call_op); + rewriter.insert(new_call_op); + + // Remove the dequantize ops and replace uses by the new func.call op. 
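[Editorial sketch, not part of the patch] After the merge, the fused function ends in a dequantize, optionally followed by a clamp realizing relu or relu6. A numeric sketch of that tail computation (the scale and zero point are made-up values):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

float DequantizeAndRelu6Sketch(int8_t q, float scale, int zero_point) {
  const float dequantized = scale * (static_cast<int>(q) - zero_point);
  return std::clamp(dequantized, 0.0f, 6.0f);  // clamp(min=0, x, max=6)
}

int main() {
  // scale 0.1, zero point 0: q=50 -> ~5.0 (inside [0, 6]),
  // q=-30 -> clamped to 0, q=90 -> ~9.0 clamped to 6.
  assert(std::fabs(DequantizeAndRelu6Sketch(50, 0.1f, 0) - 5.0f) < 1e-5);
  assert(DequantizeAndRelu6Sketch(-30, 0.1f, 0) == 0.0f);
  assert(DequantizeAndRelu6Sketch(90, 0.1f, 0) == 6.0f);
  return 0;
}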
+ SmallVector users_to_erase; + for (auto user : users) { + llvm::dyn_cast(user) + .replaceAllUsesWith(new_call_op.getResult(0)); + users_to_erase.push_back(user); + } + for (auto user : users_to_erase) rewriter.eraseOp(user); + rewriter.eraseOp(call_op); + if (failed(func_op.eraseResult(0))) { + return failure(); + } + if (failed(func_op.insertResult(0, new_call_op.getResult(0).getType(), + /*resultAttrs=*/nullptr))) { + return failure(); + } + + // Modify the quantized fused function to do dequantize+relu(6). + rewriter.setInsertionPoint(req_op); + Value new_result = rewriter.create( + req_op.getLoc(), func_op.getResultTypes()[0], req_op.getOperand()); + if (func_name.contains("_relu6_")) { + auto min = rewriter.create( + req_op.getLoc(), rewriter.getF32FloatAttr(0)); + auto max = rewriter.create( + req_op.getLoc(), rewriter.getF32FloatAttr(6)); + new_result = rewriter.create( + req_op.getLoc(), min, new_result, max); + } else if (func_name.contains("_relu_")) { + auto min = rewriter.create( + req_op.getLoc(), rewriter.getF32FloatAttr(0)); + new_result = rewriter.create( + req_op.getLoc(), min, new_result, nullptr); + } + return_op->setOperand(0, new_result); + rewriter.eraseOp(req_op); + + return success(); + } +}; + +void MergeFusionWithDequantizePass::runOnOperation() { + ModuleOp module_op = getOperation(); + MLIRContext* ctx = module_op.getContext(); + RewritePatternSet patterns(ctx); + patterns.add(ctx); + if (failed(applyPatternsGreedily(module_op, std::move(patterns)))) { + signalPassFailure(); + } +} + +} // namespace +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_nchw_convolution_to_nhwc.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_nchw_convolution_to_nhwc.cc new file mode 100644 index 000000000000..4088b84937c7 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_nchw_convolution_to_nhwc.cc @@ -0,0 +1,191 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation.h" + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_NCHWCONVOLUTIONTONHWCPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +using ::mlir::stablehlo::ConvDimensionNumbersAttr; + +class NchwConvolutionToNhwcPass + : public impl::NchwConvolutionToNhwcPassBase { + private: + void runOnOperation() override; +}; + +// Rewrites NCHW convolution to NHWC. +// * Src dimension numbers: [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1] +// * Dst dimension numbers: [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f] +class RewriteNchwConvolutionToNhwc + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(mlir::stablehlo::ConvolutionOp op, + PatternRewriter& rewriter) const override { + // Handles 2D convolutions only. 
+ if (!HasRankOf(op.getOperand(0), /*rank=*/4) || + !HasRankOf(op.getOperand(1), /*rank=*/4)) { + return failure(); + } + + if (!quant::IsOpNotQuantized(op)) return failure(); + + const ConvDimensionNumbersAttr dimension_nums = op.getDimensionNumbers(); + const bool dimension_nums_matched = + MatchInputDimensionNumbers(dimension_nums) && + MatchKernelDimensionNumbers(dimension_nums) && + MatchOutputDimensionNumbers(dimension_nums); + if (!dimension_nums_matched) { + return failure(); + } + + // Transpose the input tensor: [b, f, 0, 1] => [b, 0, 1, f] + Value input = op->getOperand(0); + const TensorType new_input_tensor_type = GetTransposedTensorType( + mlir::cast(input.getType()), kNchwToNhwcPermutation); + + auto input_transpose_op = rewriter.create( + op.getLoc(), /*resultType0=*/new_input_tensor_type, /*operand=*/input, + rewriter.getDenseI64ArrayAttr(kNchwToNhwcPermutation)); + + // Transpose the filter tensor: [o, i, 0, 1] => [0, 1, i, o] + Value filter = op->getOperand(1); + const TensorType new_filter_tensor_type = GetTransposedTensorType( + mlir::cast(filter.getType()), kOihwToHwioPermutation); + + auto filter_transpose_op = rewriter.create( + op.getLoc(), /*resultType0=*/new_filter_tensor_type, /*operand=*/filter, + rewriter.getDenseI64ArrayAttr(kOihwToHwioPermutation)); + + // [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f] + const auto new_dimension_nums = rewriter.getAttr( + /*inputBatchDimension=*/0, /*inputFeatureDimension=*/3, + /*inputSpatialDimensions=*/SmallVector{1, 2}, + /*kernelInputFeatureDimension=*/2, /*kernelOutputFeatureDimension=*/3, + /*kernelSpatialDimensions=*/SmallVector{0, 1}, + /*outputBatchDimension=*/0, /*outputFeatureDimension=*/3, + /*outputSpatialDimensions=*/SmallVector{1, 2}); + + // Determine the shape of the output tensor: [b, f, 0, 1] => [b, 0, 1, f] + auto output_tensor_type = + mlir::cast(op->getResult(0).getType()); + const TensorType new_conv_output_tensor_type = + GetTransposedTensorType(output_tensor_type, kNchwToNhwcPermutation); + + // window_strides, padding, lhs_dilation, rhs_dilation, window_reversal are + // reused without modification because the ordering of spatial dimensions + // is not modified (i.e. before: [b, f, 0, 1], after: [b, 0, 1, f] => the + // spatial dimension is still ordered as {0, 1}). + auto new_convolution_op = rewriter.create( + op.getLoc(), /*resultType0=*/new_conv_output_tensor_type, + /*lhs=*/input_transpose_op, + /*rhs=*/filter_transpose_op, + /*window_strides=*/op.getWindowStridesAttr(), + /*padding=*/op.getPaddingAttr(), + /*lhs_dilation=*/op.getLhsDilationAttr(), + /*rhs_dilation=*/op.getRhsDilationAttr(), + /*window_reversal=*/op.getWindowReversalAttr(), + /*dimension_numbers=*/new_dimension_nums, + /*feature_group_count=*/op.getFeatureGroupCountAttr(), + /*batch_group_count=*/op.getBatchGroupCountAttr(), + /*precision_config=*/op.getPrecisionConfigAttr()); + + // Transpose the output of the `ConvolutionOp` back to the original op's + // output shape so that users' shapes match. + // [b, 0, 1, f] => [b, f, 0, 1] + auto output_transpose_op = rewriter.create( + new_convolution_op.getLoc(), /*resultType0=*/output_tensor_type, + /*operand=*/new_convolution_op, + rewriter.getDenseI64ArrayAttr(kNhwcToNchwPermutation)); + + rewriter.replaceAllUsesWith(op, output_transpose_op); + return success(); + } + + private: + // Matches input dimensions corresponding to: [b, f, 0, 1]. 
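[Editorial sketch, not part of the patch] Shape-wise, the inserted transposes change the input from NCHW to NHWC and the filter from OIHW to HWIO. A small standalone check of those layout permutations, assuming `kNchwToNhwcPermutation = {0, 2, 3, 1}` and `kOihwToHwioPermutation = {2, 3, 1, 0}`:

#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

template <std::size_t N>
std::array<int64_t, N> PermuteShape(const std::array<int64_t, N>& shape,
                                    const std::array<int64_t, N>& perm) {
  std::array<int64_t, N> out{};
  for (std::size_t i = 0; i < N; ++i) out[i] = shape[perm[i]];
  return out;
}

int main() {
  // NCHW input 1x8x32x32 becomes NHWC 1x32x32x8.
  assert((PermuteShape<4>({1, 8, 32, 32}, {0, 2, 3, 1}) ==
          std::array<int64_t, 4>{1, 32, 32, 8}));
  // OIHW filter 16x8x3x3 becomes HWIO 3x3x8x16.
  assert((PermuteShape<4>({16, 8, 3, 3}, {2, 3, 1, 0}) ==
          std::array<int64_t, 4>{3, 3, 8, 16}));
  return 0;
}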
+ bool MatchInputDimensionNumbers( + const ConvDimensionNumbersAttr dimension_numbers) const { + return dimension_numbers.getInputBatchDimension() == 0 && + dimension_numbers.getInputFeatureDimension() == 1 && + dimension_numbers.getInputSpatialDimensions() == + ArrayRef{2, 3}; + } + + // Matches kernel dimensions corresponding to: [o, i, 0, 1]. + bool MatchKernelDimensionNumbers( + const ConvDimensionNumbersAttr dimension_numbers) const { + return dimension_numbers.getKernelInputFeatureDimension() == 1 && + dimension_numbers.getKernelOutputFeatureDimension() == 0 && + dimension_numbers.getKernelSpatialDimensions() == + ArrayRef{2, 3}; + } + + // Matches output dimensions corresponding to: [b, f, 0, 1]. + bool MatchOutputDimensionNumbers( + const ConvDimensionNumbersAttr dimension_numbers) const { + return dimension_numbers.getOutputBatchDimension() == 0 && + dimension_numbers.getOutputFeatureDimension() == 1 && + dimension_numbers.getOutputSpatialDimensions() == + ArrayRef{2, 3}; + } + + // Returns a new tensor type with the shape transposed according to the + // permutation. The rank of `type` and the size of `permutation` must be + // equal. + TensorType GetTransposedTensorType( + const TensorType type, const ArrayRef permutation) const { + const SmallVector after_shape = + quant::Permute(type.getShape(), permutation); + return type.cloneWith(after_shape, type.getElementType()); + } +}; + +} // namespace + +void NchwConvolutionToNhwcPass::runOnOperation() { + func::FuncOp func_op = getOperation(); + MLIRContext& ctx = getContext(); + + RewritePatternSet patterns(&ctx); + patterns.add(&ctx); + + if (failed(applyPatternsGreedily(func_op, std::move(patterns)))) { + func_op.emitError() << "Failed to run NchwConvolutionToNhwcPass."; + signalPassFailure(); + } +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_optimize_graph.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_optimize_graph.cc new file mode 100644 index 000000000000..0bb7b660e110 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_optimize_graph.cc @@ -0,0 +1,55 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" // IWYU pragma: keep + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_OPTIMIZEGRAPHPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +class OptimizeGraphPass + : public impl::OptimizeGraphPassBase { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OptimizeGraphPass) + + explicit OptimizeGraphPass() = default; + + private: + void runOnOperation() override; +}; + +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/optimize_graph.inc" + +void OptimizeGraphPass::runOnOperation() { + RewritePatternSet patterns(&getContext()); + populateWithGenerated(patterns); + auto func = getOperation(); + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + signalPassFailure(); + } +} +} // namespace + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h new file mode 100644 index 000000000000..dd62e6f27806 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h @@ -0,0 +1,61 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_TF_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_TF_PASSES_H_ + +#include +#include +#include + +#include "absl/status/statusor.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" + +namespace mlir::tf_quant::stablehlo { + +// Creates a pass that quantizes weight component of StableHLO graph. +std::unique_ptr> CreateQuantizeWeightPass( + const ::stablehlo::quantization::QuantizationComponentSpec& + quantization_component_spec = {}); + +// Converts a serialized StableHLO module to bfloat16 and output serialized +// module. 
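[Editorial sketch, not part of the patch] A hypothetical usage sketch showing how the factory functions declared in this header could be wired into an MLIR pass pipeline; the pipeline ordering and surrounding code are illustrative only:

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h"

void AddExamplePasses(mlir::PassManager& pm) {
  namespace stablehlo_quant = mlir::tf_quant::stablehlo;
  // Module-level pass declared in this header; default (empty) specs lift
  // every quantizable unit.
  pm.addPass(stablehlo_quant::CreateLiftQuantizableSpotsAsFunctionsPass(
      ::stablehlo::quantization::QuantizationSpecs()));
  // Function-level pass declared in this header, using its default
  // QuantizationComponentSpec.
  pm.addNestedPass<mlir::func::FuncOp>(
      stablehlo_quant::CreateQuantizeWeightPass());
}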
+absl::StatusOr<std::string> ConvertSerializedStableHloModuleToBfloat16(
+    StringRef serialized_stablehlo_module);
+
+std::unique_ptr<OperationPass<ModuleOp>>
+CreateLiftQuantizableSpotsAsFunctionsPass(
+    const ::stablehlo::quantization::QuantizationSpecs& quantization_specs);
+
+// Creates a pass that inserts CalibrationStatisticsSaverOp.
+std::unique_ptr<OperationPass<ModuleOp>>
+CreateInsertCalibrationStatisticsSaverPass(
+    StringRef calibration_data_dir,
+    const std::vector<std::string>& aggregator_ops_to_ignore);
+
+// Adds generated pass default constructors or options definitions.
+#define GEN_PASS_DECL
+// Adds generated pass registration functions.
+#define GEN_PASS_REGISTRATION
+#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc"
+
+}  // namespace mlir::tf_quant::stablehlo
+
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_TF_PASSES_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.td
new file mode 100644
index 000000000000..fd47b5d8ec68
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.td
@@ -0,0 +1,248 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+include "mlir/Pass/PassBase.td"
+
+def QuantizeWeightPass : Pass<"tf-stablehlo-quantize-weight", "mlir::func::FuncOp"> {
+  let summary = "Quantizes the weight component of StableHLO graph.";
+  let dependentDialects = ["mlir::stablehlo::StablehloDialect"];
+  let constructor = "mlir::tf_quant::stablehlo::CreateQuantizeWeightPass()";
+}
+
+def UnfuseMhloBatchNormPass : Pass<"tf-stablehlo-unfuse-mhlo-batch-norm", "mlir::func::FuncOp"> {
+  let summary = "Unfuses batch normalization into arithmetic ops.";
+}
+
+def LiftQuantizableSpotsAsFunctionsPass : Pass<"tf-stablehlo-lift-quantizable-spots-as-functions", "mlir::ModuleOp"> {
+  let summary = "Replaces quantization candidates with composite functions in the module.";
+  let description = [{
+    Marks frequent fusible patterns as functions for quantization targets.
+    In addition to bringing performance benefits by reducing q/dq op overhead
+    in non-full quantization, this brings higher accuracy by keeping a smaller
+    range when quantizing ops that disperse values (e.g. convolution,
+    dot_general).
+  }];
+  let dependentDialects = [
+    "mlir::func::FuncDialect",
+    "mlir::stablehlo::StablehloDialect",
+    "TF::TensorFlowDialect",
+  ];
+}
+
+def ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass : Pass<"tf-stablehlo-replace-stablehlo-ops-in-main-function-with-xla-call-module-ops", "mlir::ModuleOp"> {
+  let summary = "Replaces the StableHLO ops with separate XlaCallModuleOps.";
+  let description = [{
+    Replaces the StableHLO ops in the main function block with
+    tf.XlaCallModuleOps as separate subgraphs. Wires them back to the main
+    function block to be compatible with SavedModel structure.
+  }];
+}
+
+def RestoreFunctionNamePass : Pass<"tf-stablehlo-restore-function-name", "ModuleOp"> {
+  let summary = "Restores function name from XlaCallModule op.";
+}
+
+def QuantizeCompositeFunctionsPass : Pass<"tf-stablehlo-quantize-composite-functions", "ModuleOp"> {
+  let summary = "Quantize composite functions with QDQ input / outputs.";
+  let options = [
+    Option<"enable_per_channel_quantized_weight_",
+        "enable-per-channel-quantized-weight",
+        "bool", /*default=*/"true",
+        "Whether to enable per-channel quantized weights.">,
+    Option<"mlir_dump_file_name_", "mlir-dump-file-name",
+        "std::optional<std::string>", /*default=*/"std::nullopt",
+        "MLIR dump file name.">,
+    Option<"merge_fusion_with_dequantize_",
+        "merge-fusion-with-dequantize",
+        "bool", /*default=*/"false",
+        "Whether to merge quantized conv/dot_general fusion with subsequent dequantize.">,
+  ];
+  let dependentDialects = [
+    "mlir::arith::ArithDialect",
+    "mlir::stablehlo::StablehloDialect",
+    "mlir::quant::QuantDialect",
+    "mlir::quant::ir::TFQuantDialect",
+    "TF::TensorFlowDialect",
+  ];
+}
+
+def PrepareQuantizePass : Pass<"tf-stablehlo-prepare-quantize", "mlir::ModuleOp"> {
+  let summary = "Prepares the StableHLO dialect for static range quantization by converting quantfork.stats into quantfork.qcast and dcast ops.";
+  let options = [
+    Option<"enable_per_channel_quantized_weight_",
+        "enable-per-channel-quantized-weight",
+        "bool", /*default=*/"true",
+        "Whether to enable per-channel quantized weights.">,
+    Option<"bit_width_", "bit-width", "int", /*default=*/"8",
+        "Bitwidth of the quantized integer">
+  ];
+  let dependentDialects = [
+    "mlir::stablehlo::StablehloDialect",
+    "mlir::quant::QuantDialect",
+    "mlir::quant::ir::TFQuantDialect",
+    "mlir::arith::ArithDialect",
+  ];
+}
+
+def QuantizePass : Pass<"tf-stablehlo-quantize", "mlir::ModuleOp"> {
+  let summary = "Applies static-range quantization on ops by converting quantfork.qcast, quantfork.dcast, and float ops into uniform quantized ops.";
+  let options = [
+    Option<"enable_per_channel_quantized_weight_",
+        "enable-per-channel-quantized-weight",
+        "bool", /*default=*/"true",
+        "Whether to enable per-channel quantized weights.">,
+  ];
+  let dependentDialects = [
+    "mlir::stablehlo::StablehloDialect",
+    "mlir::quant::QuantDialect",
+    "mlir::quant::ir::TFQuantDialect",
+  ];
+}
+
+def PostQuantizePass : Pass<"tf-stablehlo-post-quantize", "mlir::func::FuncOp"> {
+  let summary = "Apply clean-up after quantization.";
+  let dependentDialects = [
+    "mlir::stablehlo::StablehloDialect",
+    "mlir::quant::ir::TFQuantDialect",
+  ];
+}
+
+def XlaCallModuleToCallPass : Pass<"tf-stablehlo-xla-call-module-to-call", "ModuleOp"> {
+  let summary = "Convert XlaCallModuleOp to func.call op";
+  let dependentDialects = [
+    "TF::TensorFlowDialect",
+  ];
+}
+
+def MergeFusionWithDequantizePass : Pass<"tf-stablehlo-merge-fusion-with-dequantize", "mlir::ModuleOp"> {
+  let summary = "Merge quantized conv/dot_general fusion with subsequent dequantize.";
+  let dependentDialects = [
+    "chlo::ChloDialect",
+    "mlir::stablehlo::StablehloDialect",
+  ];
+}
+
+def UnwrapXlaCallModuleOpPass : Pass<"tf-stablehlo-unwrap-xla-call-module-op", "ModuleOp"> {
+  let summary = "Unwrap XlaCallModuleOps into inline functions if not used for quantizing fused patterns.";
+  let dependentDialects = ["TF::TensorFlowDialect"];
+}
+
+def ConvertFuncToBfloat16Pass : Pass<"tf-stablehlo-convert-func-to-bfloat16", "mlir::func::FuncOp"> {
+  let summary = "Convert a StableHLO function to bfloat16";
+  let dependentDialects = ["mlir::stablehlo::StablehloDialect"];
+}
+
+def ConvertXlaCallModuleOpToBfloat16Pass : Pass<"tf-stablehlo-convert-xla-call-module-op-to-bfloat16", "mlir::func::FuncOp"> {
+  let summary = "Convert serialized XlaCallModuleOp to bfloat16";
+  let dependentDialects = [
+    "TF::TensorFlowDialect",
+    "mlir::quant::QuantDialect",
+    "mlir::shape::ShapeDialect",
+    "mlir::stablehlo::StablehloDialect",
+  ];
+}
+
+def ConvertShapeToStablehloWithConstraintsPass : Pass<"tf-stablehlo-convert-shape-to-stablehlo-with-constraints", "mlir::func::FuncOp"> {
+  let summary = "Convert shape.cstr_broadcastable to stablehlo.custom_call @shape_assertion";
+  let dependentDialects = [
+    "mlir::shape::ShapeDialect",
+    "mlir::tensor::TensorDialect",
+    "mlir::stablehlo::StablehloDialect",
+  ];
+}
+
+def OptimizeGraphPass : Pass<"tf-optimize-graph", "ModuleOp"> {
+  let summary = "Optimize the sub-optimal patterns after quantization.";
+  let dependentDialects = ["mlir::stablehlo::StablehloDialect",];
+}
+
+def NchwConvolutionToNhwcPass : Pass<"tf-stablehlo-nchw-convolution-to-nhwc", "mlir::func::FuncOp"> {
+  let summary = "Converts stablehlo.convolution ops from NCHW format to NHWC format.";
+  let description = [{
+    Matches `ConvolutionOp`s with NCHW format and converts them to NHWC
+    format by inserting `TransposeOp`s on the input, filter, and output tensors.
+    In terms of dimension numbers, this matches the
+    `[b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1]` format and converts it to the
+    `[b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f]` format.
+
+    This pass is useful for converting models that conventionally use the NCHW
+    format to target hardware that is more NHWC-friendly.
+  }];
+  let dependentDialects = ["mlir::stablehlo::StablehloDialect"];
+}
+
+def DeferActivationTransposePass : Pass<"tf-stablehlo-defer-activation-transpose", "mlir::func::FuncOp"> {
+  let summary = "Merges stablehlo.transpose for activations.";
+  let description = [{
+    Defers activation transposes (e.g. LHS of `stablehlo.add`) to the output and
+    optionally inserts `stablehlo.transpose`s to match the shape of operands.
+    This is useful when recursively pushing down the extra `stablehlo.transpose`
+    inserted to activation tensors after running `NchwConvolutionToNhwcPass`.
+
+    Currently only converts limited cases that appear in NCHW->NHWC 2D
+    convolution conversion, to avoid introducing unwanted pessimizations.
+  }];
+  let dependentDialects = ["mlir::stablehlo::StablehloDialect"];
+}
+
+def InsertWeightParamPass : Pass<"tf-stablehlo-insert-weight-param", "mlir::func::FuncOp"> {
+  let summary = "Inserts quantization parameters of weights for weight-only quantization and dynamic range quantization.";
+  let dependentDialects = [
+    "mlir::stablehlo::StablehloDialect",
+    "TF::TensorFlowDialect",
+    "mlir::quant::QuantDialect",
+    "mlir::quant::ir::TFQuantDialect",
+  ];
+}
+
+def FoldConstantTransposePass : Pass<"tf-stablehlo-fold-constant-transpose", "mlir::func::FuncOp"> {
+  let summary = "Folds stablehlo.constant -> stablehlo.transpose patterns.";
+  let description = [{
+    Finds patterns where a `stablehlo.constant` is directly followed by a
+    `stablehlo.transpose` and folds them into a single `stablehlo.constant`.
+    This is considered an aggressive optimization, but it is useful to eliminate
+    `stablehlo.constant`->`stablehlo.transpose` patterns which are often
+    by-products of other shape conversion optimizations, such as NCHW->NHWC
+    convolution conversion.
+  }];
+  let dependentDialects = ["mlir::stablehlo::StablehloDialect"];
+}
+
+def RemoveShardingCustomCallPass : Pass<"tf-stablehlo-remove-sharding-custom-call", "mlir::func::FuncOp"> {
+  let summary = "Removes `stablehlo.custom_call @Sharding`";
+  let description = [{
+    Finds `stablehlo.custom_call @Sharding` ops and removes all instances of
+    them, replacing each use with the op's operand. This is used where sharding
+    doesn't make much sense or sharding custom calls are incompatible, e.g. for
+    on-device targets.
+  }];
+  let dependentDialects = ["mlir::stablehlo::StablehloDialect"];
+}
+
+def InsertCalibrationStatisticsSaverPass : Pass<"tf-stablehlo-insert-calibration-statistics-saver", "ModuleOp"> {
+  let summary = "Inserts `CalibrationStatisticsSaver` op to collect and save calibration statistics.";
+  let description = [{
+    Finds all `CustomAggregator` ops in each function and adds a single
+    `CalibrationStatisticsSaver` op at the end of the function to collect their
+    statistics.
+  }];
+  let options = [
+    ListOption<"aggregator_ops_to_ignore_", "aggregator-ops-to-ignore", "std::string",
+        "Ops to ignore when inserting CalibrationStatisticsSaver.">,
+    Option<"calibration_data_dir_", "calibration-data-dir",
+        "std::string", /*default=*/"",
+        "The directory to save calibration data.">,
+  ];
+  let dependentDialects = ["TF::TensorFlowDialect"];
+}
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_post_quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_post_quantize.cc
new file mode 100644
index 000000000000..82e85a0c3470
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_post_quantize.cc
@@ -0,0 +1,160 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include + +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_traits.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" // IWYU pragma: keep + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_POSTQUANTIZEPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +// Applies clean-up patterns after quantization. +class PostQuantizePass : public impl::PostQuantizePassBase { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PostQuantizePass) + + explicit PostQuantizePass() = default; + + private: + void runOnOperation() override; +}; + +// TODO: b/305815328 - Consider preserving leading and trailing QDQs for +// ModifyIONodesPass in TFLite use cases. +// Removes the back-to-back quantize and dequantize ops with volatile attribute. +class RemoveVolatileQdqPattern + : public OpRewritePattern { + public: + explicit RemoveVolatileQdqPattern(MLIRContext* context) + : OpRewritePattern(context) {} + + LogicalResult matchAndRewrite(mlir::quant::ir::DequantizeCastOp op, + PatternRewriter& rewriter) const override { + auto input_op = op.getArg().getDefiningOp(); + if (auto q = + llvm::dyn_cast_or_null(input_op)) { + if (!q->getAttr(kVolatileOpAttrName)) return failure(); + + // If the quantize op is a requantize op, it is being used in other scale + // adjustments and should be kept. Instead, move dequantize op before the + // requantize op to remove the unnecessary requantize op. + if (const QuantizedType qtype = + QuantizedType::getQuantizedElementType(q.getArg().getType())) { + rewriter.setInsertionPoint(op); + rewriter.replaceOpWithNewOp( + op, op.getResult().getType(), q.getArg()); + return success(); + } + + op.replaceAllUsesWith(q.getArg()); + return success(); + } + return failure(); + } +}; + +// Replaces constant and uniform_quantize ops with single quantized constant op. 
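+// For illustration only (shapes and quantization parameters below are made
+// up), a pair such as:
+//   %cst = stablehlo.constant dense<...> : tensor<2x3xf32>
+//   %0 = stablehlo.uniform_quantize %cst
+//        : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform<i8:f32, 0.02>>
+// is folded into a single stablehlo.constant that directly carries the
+// quantized values and the quantized result type.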
+class QuantizeConstPattern + : public OpRewritePattern { + public: + explicit QuantizeConstPattern(MLIRContext* context) + : OpRewritePattern(context) {} + + LogicalResult matchAndRewrite(mlir::stablehlo::UniformQuantizeOp op, + PatternRewriter& rewriter) const override { + DenseFPElementsAttr attr; + if (matchPattern(op.getOperand(), m_Constant(&attr))) { + const Type qtype = op.getResult().getType(); + ElementsAttr quantized_attr = Quantize(attr, qtype); + if (quantized_attr) { + rewriter.replaceOpWithNewOp( + op, qtype, quantized_attr); + return success(); + } + } + return failure(); + } +}; + +// Replaces quantfork.dcast with stablehlo.uniform_dequantize. +class ConvertDequantizeCastToUniformDequantizePattern + : public OpRewritePattern { + public: + explicit ConvertDequantizeCastToUniformDequantizePattern(MLIRContext* context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(mlir::quant::ir::DequantizeCastOp dq_op, + PatternRewriter& rewriter) const override { + rewriter.replaceOpWithNewOp( + dq_op, dq_op.getResult().getType(), dq_op.getArg()); + return success(); + } +}; + +// Replaces quantfork.qcast with stablehlo.uniform_quantize. +class ConvertQuantizeCastToUniformQuantizePattern + : public OpRewritePattern { + public: + explicit ConvertQuantizeCastToUniformQuantizePattern(MLIRContext* context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(mlir::quant::ir::QuantizeCastOp q_op, + PatternRewriter& rewriter) const override { + rewriter.replaceOpWithNewOp( + q_op, q_op.getResult().getType(), q_op.getArg()); + return success(); + } +}; + +void PostQuantizePass::runOnOperation() { + RewritePatternSet patterns(&getContext()); + func::FuncOp func = getOperation(); + MLIRContext* ctx = func.getContext(); + // TODO: b/307463853 - Consider splitting passes for each pattern set. + patterns.add, + RemoveVolatileQdqPattern>(ctx); + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + signalPassFailure(); + } + + RewritePatternSet patterns_2(&getContext()); + patterns_2 + .add(ctx); + if (failed(applyPatternsGreedily(func, std::move(patterns_2)))) { + signalPassFailure(); + } +} + +} // namespace +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_prepare_quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_prepare_quantize.cc new file mode 100644 index 000000000000..b7976e35c7f4 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_prepare_quantize.cc @@ -0,0 +1,200 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include + +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_driver.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/ops/tf_stablehlo_op_quant_spec.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace tf_quant { +namespace stablehlo { + +#define GEN_PASS_DEF_PREPAREQUANTIZEPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +// Applies prepare quantization on the model in TF dialect. This pass runs +// before the quantization pass and propagate the quantization parameters +// across ops. This step is necessary for post-training quantization and also +// making the quantization rule for some operations in the quantization-aware +// training quantization simpler. +class PrepareQuantizePass + : public impl::PrepareQuantizePassBase { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PrepareQuantizePass) + + using impl::PrepareQuantizePassBase< + PrepareQuantizePass>::PrepareQuantizePassBase; + + explicit PrepareQuantizePass(const bool enable_per_channel_quantized_weight, + const int bit_width) { + enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight; + bit_width_ = bit_width; + } + + void runOnOperation() override; +}; + +// Merges consecutive QuantizeCast ops. See b/246655213 for details. +// For example, the following case: +// %1 = quantfork.QuantizeCastOp(%0) : f32 -> qtype1 +// %2 = quantfork.QuantizeCastOp(%1) : qtype1 -> qtype2 +// %3 = quantfork.QuantizedOp1(%1) +// %4 = quantfork.QuantizedOp2(%2) +// will be tranformed to: +// %1 = quantfork.QuantizeCastOp(%0) : f32 -> qtype1 +// %2 = quantfork.QuantizeCastOp(%0) : f32 -> qtype2 +// %3 = quantfork.QuantizedOp1(%1) +// %4 = quantfork.QuantizedOp2(%2) +// Converting from f32 -> qtype1 -> qtype2 will add unexpected quantization +// lost for %2. This pattern avoids that by converting from f32 -> qtype2 +// directly. 
+class MergeConsecutiveQuantizeCast + : public mlir::OpRewritePattern { + public: + explicit MergeConsecutiveQuantizeCast(MLIRContext* context) + : OpRewritePattern(context) {} + + private: + LogicalResult matchAndRewrite(mlir::quant::ir::QuantizeCastOp q_op, + PatternRewriter& rewriter) const override { + auto preceding_qcast = + q_op.getArg().getDefiningOp(); + if (!preceding_qcast) return failure(); + + auto new_qcast = rewriter.create( + q_op.getLoc(), q_op.getType(), preceding_qcast.getArg()); + new_qcast->setAttr(kVolatileOpAttrName, rewriter.getUnitAttr()); + q_op->replaceAllUsesWith(new_qcast); + return success(); + } +}; + +class ConvertTFConstOpToArithConstOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(TF::ConstOp op, + PatternRewriter& rewriter) const override { + rewriter.replaceOpWithNewOp(op, op.getValue()); + return success(); + } +}; + +class ConvertStablehloConstToArithConstOp + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(mlir::stablehlo::ConstantOp op, + PatternRewriter& rewriter) const override { + rewriter.replaceOpWithNewOp(op, op.getValue()); + return success(); + } +}; + +class ConvertArithConstToStablehloConstOp + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(arith::ConstantOp op, + PatternRewriter& rewriter) const override { + rewriter.replaceOpWithNewOp(op, op.getValue()); + return success(); + } +}; + +void PrepareQuantizePass::runOnOperation() { + ModuleOp module_op = getOperation(); + MLIRContext* ctx = module_op.getContext(); + + auto func_op_quant_spec = GetStableHloOpQuantSpec; + auto func_op_quant_scale_spec = GetStableHloQuantConstraints; + + for (auto func_op : module_op.getOps()) { + // The function might contain more stats ops than required, and it will + // introduce requantize if the calibration stats have conflicts. This tries + // to remove all the redundant stats ops. + RemoveRedundantStatsOps(func_op, func_op_quant_spec, + func_op_quant_scale_spec); + + RewritePatternSet patterns(ctx); + // Convert quant stats to int8 quantization parameters. + // Currently, only activation stats are imported, so narrow_range = false. + patterns.add>( + bit_width_, + /*narrow_range=*/false, + /*is_signed=*/true, + /*legacy_float_scale=*/false, ctx); + // Convert all constants to arith::ConstantOp as quantization driver can + // deal with the arith::ConstantOp instances. + patterns.add(ctx); + patterns.add(ctx); + if (failed(applyPatternsGreedily(func_op, std::move(patterns)))) { + signalPassFailure(); + } + + // Finally, the quantization parameters can be propagated to the rest of the + // values (tensors). + ApplyQuantizationParamsPropagation( + func_op, /*is_signed=*/true, bit_width_, + !enable_per_channel_quantized_weight_, func_op_quant_spec, + func_op_quant_scale_spec, + /*infer_tensor_ranges=*/true, /*legacy_float_scale=*/false, + /*is_qdq_conversion=*/false); + + // Restore constants as stablehlo::ConstantOp. + RewritePatternSet patterns_2(ctx); + patterns_2 + .add( + ctx); + if (failed(applyPatternsGreedily(func_op, std::move(patterns_2)))) { + signalPassFailure(); + } + } +} + +} // namespace + +// Creates an instance of the TensorFlow dialect PrepareQuantize pass. 
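+// Sketch of typical usage (the surrounding pipeline setup is assumed and is
+// not part of this file):
+//   mlir::PassManager pm(&ctx);
+//   pm.addPass(CreatePrepareQuantizePass(
+//       /*enable_per_channel_quantized_weight=*/true, /*bit_width=*/8));
+//   if (failed(pm.run(module_op))) { /* handle failure */ }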
+std::unique_ptr> CreatePrepareQuantizePass( + const bool enable_per_channel_quantized_weight, const int bit_width) { + return std::make_unique( + enable_per_channel_quantized_weight, bit_width); +} + +} // namespace stablehlo +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantization_patterns.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantization_patterns.cc new file mode 100644 index 000000000000..028d7e861d21 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantization_patterns.cc @@ -0,0 +1,1039 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantization_patterns.h" + +#include +#include +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/BlockSupport.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_traits.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/ops/tf_stablehlo_op_quant_spec.h" +#include 
"tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +#define DEBUG_TYPE "populate-quantization-patterns" + +namespace mlir::tf_quant::stablehlo { + +namespace { + +using ::mlir::stablehlo::AddOp; +using ::mlir::stablehlo::BroadcastInDimOp; +using ::mlir::stablehlo::ConcatenateOp; +using ::mlir::stablehlo::ConvolutionOp; +using ::mlir::stablehlo::DotGeneralOp; +using ::mlir::stablehlo::DynamicBroadcastInDimOp; +using ::mlir::stablehlo::GatherOp; +using ::mlir::stablehlo::GetDimensionSizeOp; +using ::mlir::stablehlo::ReshapeOp; +using ::mlir::stablehlo::UniformQuantizeOp; +using ::mlir::tf_quant::FindUserOfType; +using ::mlir::tf_quant::TryCast; +using ::stablehlo::quantization::Method; +using ::stablehlo::quantization::QuantizedDimension; +using ::stablehlo::quantization::QuantizedType; +using ::stablehlo::quantization::StaticRangePtq; + +constexpr StringRef kEntryFuncAttrName = "_entry_function"; + +// Returns broadcasted user op of an input op. Returns null if +// the op is not broadcasted or not the intended type. +// Supports both static broadcast and dynamic broadcast. +// Note that the patterns below differ from lifted patterns as +// ShapeLegalizeToHloPass is ran prior to running this pass. +// +// Dynamically broadcasted bias due to unknown input batch size +// usually has the following pattern. In the example below, +// the input operand would be stablehlo.convolution op, and return value would +// be stablehlo.add op. +// +// ``` +// %0 = stablehlo.constant dense<3> +// %1 = stablehlo.constant dense<4> +// %2 = stablehlo.constant dense<2> +// %3 = stablehlo.convolution(%%arg0, %%arg1) : +// (tensor, tensor<2x3x3x2xf32>) -> tensor +// %4 = stablehlo.get_dimension_size %3, dim = 0 : +// (tensor) -> tensor +// %5 = stablehlo.reshape %4 : +// (tensor) -> tensor<1xi32> +// %6 = stablehlo.concatenate %5, %0, %1, %2, dim = 0 : +// (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) +// -> tensor<4xi32> +// %7 = stablehlo.dynamic_broadcast_in_dim %arg2, %6 +// %8 = stablehlo.add %3, %7 +// ``` +// +// Statically broadcasted bias will be broadcasted to match the accumulation. +// ``` +// %3 = stablehlo.convolution(%%arg0, %%arg1) : +// (tensor, tensor<2x3x3x2xf32>) -> tensor +// %4 = stablehlo.broadcast_in_dim %arg2, %3 +// %5 = stablehlo.add %3, %4 +// ``` +template +Operation* GetBroadcastedUserOp(Operation* op) { + // Broadcast bias for known input shape. + auto broadcast_in_dim_op = FindUserOfType(op); + if (broadcast_in_dim_op != nullptr) { + auto target_op = FindUserOfType(broadcast_in_dim_op); + if (target_op != nullptr) return target_op; + } + // Broadcast bias for unknown input shape. + auto get_dimension_size_op = FindUserOfType(op); + if (get_dimension_size_op == nullptr) return nullptr; + + auto reshape_op = FindUserOfType(get_dimension_size_op); + if (reshape_op == nullptr) return nullptr; + + auto concatenate_op = FindUserOfType(reshape_op); + if (concatenate_op == nullptr) return nullptr; + + auto dynamic_broadcast_in_dim_op = + FindUserOfType(concatenate_op); + if (dynamic_broadcast_in_dim_op == nullptr) return nullptr; + + auto target_op = FindUserOfType(dynamic_broadcast_in_dim_op); + return target_op; +} + +// Gets the corresponding quantized function name from the given function name. 
+// Example: "composite_dot_general_fn_1" => "quantized_dot_general_fn" +std::string GetQuantizedFunctionName(const StringRef func_name) { + return Twine(kQuantizedFuncPrefix) + .concat(func_name.rsplit(kCompositeFuncPrefix).second) + .str(); +} + +// Returns true if `xla_call_module_op` is quantized. To be considered +// quantized, it should meet three conditions: +// 1. At least one of the inputs and outputs should be a uniform quantized type. +// 2. `xla_call_module_op` should have the `kQuantTraitAttrName` attribute. +// 3. It should also have the `kEntryFuncAttrName` attribute, which points to +// the function that `xla_call_module_op` represents. +bool IsQuantizedXlaCallModuleOp(TF::XlaCallModuleOp xla_call_module_op) { + return !quant::IsOpNotQuantized(xla_call_module_op) && + xla_call_module_op->hasAttr(kQuantTraitAttrName) && + xla_call_module_op->hasAttr(kEntryFuncAttrName); +} + +// Returns the entry function, i.e. the callee of `xla_call_module_op`. +func::FuncOp GetEntryFuncOp(TF::XlaCallModuleOp xla_call_module_op, + const SymbolTable symbol_table) { + const auto entry_function_symbol_ref = + xla_call_module_op->getAttrOfType(kEntryFuncAttrName); + + return dyn_cast_or_null( + symbol_table.lookup(entry_function_symbol_ref.getValue())); +} + +// Replaces the function type of `entry_func_op` to a quantized one, matching +// the input and output types of `xla_call_module_op`. +void SetQuantizedFunctionType(PatternRewriter& rewriter, + func::FuncOp entry_func_op, + TF::XlaCallModuleOp xla_call_module_op) { + SmallVector arg_types; + SmallVector arg_locs; + for (const Value arg : xla_call_module_op.getArgs()) { + arg_types.push_back(arg.getType()); + arg_locs.push_back(arg.getLoc()); + } + + SmallVector output_types; + for (const Value output : xla_call_module_op.getOutput()) { + output_types.push_back(output.getType()); + } + + entry_func_op.setFunctionType( + rewriter.getFunctionType(arg_types, output_types)); + + // Replace argument types and locs. + Block& entry = entry_func_op->getRegion(0).front(); + for (auto [arg, arg_type, arg_loc] : + llvm::zip_equal(entry.getArguments(), arg_types, arg_locs)) { + arg.setType(arg_type); + arg.setLoc(arg_loc); + } +} + +// Creates a UniformQuantize op and sets it as return op. +// The requantize scale and zero point should be determined from the +// `entry_func_op`'s output, containing information on layerStats of the +// entire function. +void CreateAndReturnUniformQuantizeOp(PatternRewriter& rewriter, Operation& op, + func::FuncOp entry_func_op, + const Type func_result_type) { + // Add i32 -> i8 requantization. + UniformQuantizeOp uniform_quant_op = rewriter.create( + op.getLoc(), func_result_type, op.getResults()); + cast(entry_func_op.getBody().front().getTerminator()) + .setOperand(0, uniform_quant_op); +} + +template +// Creates a quantized bias pattern for static and dynamic shape case +// and sets the quantized bias as the return op. +void CreateAndReturnQuantizedBiasPattern( + Operation* op, PatternRewriter& rewriter, func::FuncOp entry_func_op, + const Type func_result_type, const Type accumulation_quantized_element_type, + GemmStyleOp gemm_style_op) { + const Value bias_op = op->getOperand(1); + Value add_op_result = op->getResult(0); + + // Broadcast bias value if unmatched with output shape. 
+ auto bcast_op = TryCast(bias_op.getDefiningOp(), + /*name=*/"broadcast_in_dim_op"); + + if (failed(bcast_op)) { + bcast_op = TryCast( + bias_op.getDefiningOp(), + /*name=*/"dynamic_broadcast_in_dim_op"); + } + // Update the bias type for both static and dynamic broadcasts. + if (succeeded(bcast_op)) { + Value bcast_op_result = (*bcast_op)->getResult(0); + auto bcast_op_result_type = + mlir::cast(bcast_op_result.getType()); + const ArrayRef bcast_shape = bcast_op_result_type.getShape(); + const TensorType new_bcast_op_result_type = bcast_op_result_type.cloneWith( + bcast_shape, accumulation_quantized_element_type); + bcast_op_result.setType(new_bcast_op_result_type); + } + + const auto add_op_result_type = + mlir::cast(add_op_result.getType()); + const ArrayRef add_op_shape = add_op_result_type.getShape(); + // For quantized bias add case, lhs, rhs, and result have the same types. + const TensorType new_add_op_result_type = add_op_result_type.cloneWith( + add_op_shape, accumulation_quantized_element_type); + add_op_result.setType(new_add_op_result_type); + + AddOp bias_add_op = + rewriter.create(gemm_style_op->getLoc(), gemm_style_op, bias_op); + + CreateAndReturnUniformQuantizeOp(rewriter, *bias_add_op, entry_func_op, + func_result_type); +} + +// An interface representing patterns that quantizes an entry function's body. +// The entry function's signatures should have already been quantized at the +// point of rewriting. +class EntryFuncBodyQuantizationPattern { + public: + virtual ~EntryFuncBodyQuantizationPattern() = default; + + // Returns `success()` if `entry_func_op`'s body is eligible for rewriting. At + // this point `entry_func_op`'s signature has not been reset with quantized + // types. + virtual LogicalResult match(func::FuncOp entry_func_op, + const Method& quantization_method) const = 0; + + // Rewrites the `entry_func_op`'s body. + virtual void rewrite(func::FuncOp entry_func_op, + const Method& quantization_method, + PatternRewriter& rewriter) const = 0; +}; + +// Gemm Style Op: glossary/gemm. +template +// Match for all gemm_style op and check for possible fusions. +LogicalResult MatchGemmStyleOp(func::FuncOp entry_func_op) { + const auto op_iterator_range = entry_func_op.getOps(); + if (op_iterator_range.empty()) { + LLVM_DEBUG(llvm::dbgs() << "Function does not have " + << GemmStyleOp::getOperationName() << " op.\n"); + return failure(); + } + if (!isa( + (*op_iterator_range.begin()).getResult().getType())) { + LLVM_DEBUG(llvm::dbgs() << GemmStyleOp::getOperationName() + << " op must have ranked tensor type.\n"); + return failure(); + } + + MutableArrayRef operands = + entry_func_op.getBody().getArguments(); + // Function must have input, filter, and optionally bias. + if (operands.size() != 2 && operands.size() != 3) { + LLVM_DEBUG(llvm::dbgs() << GemmStyleOp::getOperationName() + << " op function should have 2 or 3 operands.\n"); + return failure(); + } + return success(); +} + +// Gemm Style Op: glossary/gemm. 
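+// Summary of the rewrite performed below: the accumulation of the gemm-style
+// op (convolution / dot_general) is given an i32 quantized type whose scale is
+// derived from the already-quantized operands:
+//   per-tensor:  accumulation_scale    = input_scale * filter_scale
+//   per-channel: accumulation_scale[c] = input_scale * filter_scale[c]
+// with zero point 0, before the optional (broadcasted) bias add is fused.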
+template +void RewriteGemmStyleOp(func::FuncOp entry_func_op, PatternRewriter& rewriter, + const bool enable_per_channel_quantized_weight) { + const GemmStyleOp gemm_style_op = + *entry_func_op.getOps().begin(); + + const Type input_type = entry_func_op.getArgumentTypes()[0]; + const Type filter_type = entry_func_op.getArgumentTypes()[1]; + const Type func_result_type = entry_func_op.getResultTypes()[0]; + + Value gemm_style_op_result = gemm_style_op->getResult(0); + const auto gemm_style_op_result_type = + mlir::cast(gemm_style_op_result.getType()); + const ArrayRef gemm_style_shape = + gemm_style_op_result_type.getShape(); + + Type accumulation_quantized_element_type; + TensorType new_gemm_style_op_result_type; + + const double input_scale = + mlir::cast(getElementTypeOrSelf(input_type)) + .getScale(); + + if (enable_per_channel_quantized_weight) { + ArrayRef filter_scales = + mlir::cast( + getElementTypeOrSelf(filter_type)) + .getScales(); + std::vector result_scales; + result_scales.reserve(filter_scales.size()); + + for (const double filter_scale : filter_scales) { + result_scales.push_back(input_scale * filter_scale); + } + + const ArrayRef zero_points = + mlir::cast( + getElementTypeOrSelf(filter_type)) + .getZeroPoints(); + + // `stablehlo.convolution` assumes the following format: + // [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f] + // `stablehlo.dot_general` can take various formats. We only per-channel + // quantize non-batch ops. + // `stablehlo.dot_general` legalizable to `tfl.fully_connected` has a + // filter rank of 2 with the last dimension as the channel dimension. + const int64_t quantization_dimension = + mlir::cast(filter_type).getShape().size() - 1; + accumulation_quantized_element_type = + quant::CreateI32F32UniformQuantizedPerAxisType( + gemm_style_op->getLoc(), *rewriter.getContext(), result_scales, + zero_points, quantization_dimension); + + new_gemm_style_op_result_type = gemm_style_op_result_type.cloneWith( + gemm_style_shape, accumulation_quantized_element_type); + } else { + const double filter_scale = + mlir::cast(getElementTypeOrSelf(filter_type)) + .getScale(); + const double result_scale = input_scale * filter_scale; + + accumulation_quantized_element_type = + quant::CreateI32F32UniformQuantizedType( + gemm_style_op->getLoc(), *rewriter.getContext(), result_scale, + /*zero_point=*/0); + + new_gemm_style_op_result_type = gemm_style_op_result_type.cloneWith( + gemm_style_shape, accumulation_quantized_element_type); + } + + gemm_style_op_result.setType(new_gemm_style_op_result_type); + + rewriter.setInsertionPointAfter(gemm_style_op); + + Operation* next_op = FindUserOfType<>(gemm_style_op); + + // If activation exists, omit clipping op. + // Since out_scale and out_zp are computed based on clipped range, + // explicit activation clipping op is not required. + if (isa(next_op) && gemm_style_op->hasOneUse()) { + // bias fusion + CreateAndReturnQuantizedBiasPattern( + next_op, rewriter, entry_func_op, func_result_type, + accumulation_quantized_element_type, gemm_style_op); + } else if (auto add_op = cast_or_null( + GetBroadcastedUserOp(gemm_style_op))) { + // broadcasted bias fusion + rewriter.setInsertionPointAfter(add_op); + CreateAndReturnQuantizedBiasPattern( + add_op, rewriter, entry_func_op, func_result_type, + accumulation_quantized_element_type, gemm_style_op); + } else { + // Non fusible op + // If an op is used multiple times and is not a broadcasted shape case, + // do not apply quantization of fused patterns to prevent removal of + // dependee ops. 
+ CreateAndReturnUniformQuantizeOp(rewriter, *gemm_style_op, entry_func_op, + func_result_type); + } +} + +// Quantizes the entry function's body containing a `DotGeneralOp`. +class QuantizeDotGeneralOpPattern : public EntryFuncBodyQuantizationPattern { + public: + explicit QuantizeDotGeneralOpPattern( + const bool enable_per_channel_quantized_weight) + : enable_per_channel_quantized_weight_( + enable_per_channel_quantized_weight) {} + + LogicalResult match(func::FuncOp entry_func_op, + const Method& quantization_method) const override { + if (!quantization_method.has_static_range_ptq()) { + return failure(); + } + return MatchGemmStyleOp(entry_func_op); + } + + void rewrite(func::FuncOp entry_func_op, const Method& quantization_method, + PatternRewriter& rewriter) const override { + DotGeneralOp dot_general_op = *entry_func_op.getOps().begin(); + const bool should_quantize_per_channel = + enable_per_channel_quantized_weight_ && + GetDotGeneralQuantizationDim(dot_general_op); + RewriteGemmStyleOp(entry_func_op, rewriter, + should_quantize_per_channel); + } + + private: + [[deprecated( + "Do not rely on this field for per-channel quantization. Use `Method` " + "instead.")]] const bool enable_per_channel_quantized_weight_; +}; + +// Quantizes the entry function's body containing a `ConvolutionOp`. +class QuantizeConvolutionOpPattern : public EntryFuncBodyQuantizationPattern { + public: + explicit QuantizeConvolutionOpPattern( + const bool enable_per_channel_quantized_weight) + : enable_per_channel_quantized_weight_( + enable_per_channel_quantized_weight) {} + + LogicalResult match(func::FuncOp entry_func_op, + const Method& quantization_method) const override { + if (!quantization_method.has_static_range_ptq()) { + return failure(); + } + return MatchGemmStyleOp(entry_func_op); + } + + void rewrite(func::FuncOp entry_func_op, const Method& quantization_method, + PatternRewriter& rewriter) const override { + RewriteGemmStyleOp( + entry_func_op, rewriter, + enable_per_channel_quantized_weight_ && + IsWeightPerChannelQuantized(quantization_method)); + } + + // Returns true if the quantization method indicates per-channel quantization + // for convolution weights. This method specifically matches a quantization + // dimension of 3 for the input index 1 or unspecified quantization dimension + // for the input index 1. + bool IsWeightPerChannelQuantized(const Method& quantization_method) const { + if (quantization_method.has_static_range_ptq()) { + const StaticRangePtq& static_range_ptq_spec = + quantization_method.static_range_ptq(); + + if (static_range_ptq_spec.input_quantized_types().contains(1)) { + const QuantizedType& weight_quantized_type = + static_range_ptq_spec.input_quantized_types().at(1); + if (weight_quantized_type.has_per_tensor()) { + return false; + } + const QuantizedDimension& dimension_specs = + weight_quantized_type.dimension_specs(); + return !dimension_specs.has_dimension() || + dimension_specs.dimension() == 3; + } + } + return false; + } + + private: + [[deprecated( + "Do not rely on this field for per-channel quantization. Use `Method` " + "instead.")]] const bool enable_per_channel_quantized_weight_; +}; + +// Quantizes the entry function's body for weight-only quantized op. 
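+// Note: `rewrite` below is intentionally empty. For weight-only PTQ only the
+// function signature is updated by the caller (QuantizeEntryFuncOp); the float
+// body is kept, and the quantized weight is expected to be dequantized at
+// execution time.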
+template +class QuantizeWeightOnlyOpPattern : public EntryFuncBodyQuantizationPattern { + public: + explicit QuantizeWeightOnlyOpPattern( + const bool enable_per_channel_quantized_weight) + : enable_per_channel_quantized_weight_( + enable_per_channel_quantized_weight) {} + + LogicalResult match(func::FuncOp entry_func_op, + const Method& quantization_method) const override { + if (!quantization_method.has_weight_only_ptq()) { + return failure(); + } + return MatchGemmStyleOp(entry_func_op); + } + + void rewrite(func::FuncOp entry_func_op, const Method& quantization_method, + PatternRewriter& rewriter) const override {} + + private: + [[deprecated( + "Do not rely on this field for per-channel quantization. Use `Method` " + "instead.")]] const bool enable_per_channel_quantized_weight_; +}; + +template +class QuantizeSingularOpPattern : public EntryFuncBodyQuantizationPattern { + public: + explicit QuantizeSingularOpPattern( + const bool enable_per_channel_quantized_weight) {} + + LogicalResult match(func::FuncOp entry_func_op, + const Method& quantization_method) const override { + if (!quantization_method.has_static_range_ptq()) { + return failure(); + } + const auto op_iterator_range = entry_func_op.getOps(); + if (op_iterator_range.empty()) { + LLVM_DEBUG(llvm::dbgs() << "Function does not have " + << SingularOpT::getOperationName() << " op.\n"); + return failure(); + } + + // Entry function body should have one block with two ops(op to be quantized + // and return op). + Region& body = entry_func_op.getBody(); + if (body.getBlocks().size() != 1 || + body.begin()->getOperations().size() != 2) { + return failure(); + } + + if (!isa( + (*op_iterator_range.begin()).getResult().getType())) { + LLVM_DEBUG(llvm::dbgs() << SingularOpT::getOperationName() + << " op must have ranked tensor type.\n"); + return failure(); + } + return success(); + } + + void rewrite(func::FuncOp entry_func_op, const Method& quantization_method, + PatternRewriter& rewriter) const override { + auto singular_op = *entry_func_op.getOps().begin(); + Value singular_op_result = singular_op.getResult(); + + // For ops that require same operand and result types, use explicit + // requantize op rather than using `entry_func_op`'s result as op result. + auto spec = GetStableHloQuantConstraints(singular_op); + const bool has_same_operand_and_result_type = + spec->has_same_operand_and_result_type_requirement; + if (has_same_operand_and_result_type) { + const Type operand_type = entry_func_op.getArgumentTypes()[0]; + const Type func_result_type = entry_func_op.getResultTypes()[0]; + + // Get the quantized tensor manipulation op's output type and update. + const auto singular_op_result_type = + mlir::cast(singular_op_result.getType()); + const ArrayRef singular_op_shape = + singular_op_result_type.getShape(); + const TensorType new_singular_op_result_type = + singular_op_result_type.cloneWith( + singular_op_shape, mlir::cast( + getElementTypeOrSelf(operand_type))); + singular_op_result.setType(new_singular_op_result_type); + + // Create requantization op and return. + rewriter.setInsertionPointAfter(singular_op); + CreateAndReturnUniformQuantizeOp(rewriter, *singular_op, entry_func_op, + func_result_type); + } else { + singular_op_result.setType(entry_func_op.getResultTypes()[0]); + } + } +}; + +// Converts `entry_func_op` to be quantized according to the respective +// inputs and outputs of `xla_call_module_op` that are possibly quantized. It +// signature (type) is reset to match that of `xla_call_module_op`. 
+// `entry_func_body_quantization_pattern` rewrites the function's body, based on +// the new signature. `quantization_method` specifies the quantization method +// applied to the quantizable unit `xla_call_module_op` and its corresponding +// function `entry_func_op`. +void QuantizeEntryFuncOp( + const MLIRContext& ctx, PatternRewriter& rewriter, + const TF::XlaCallModuleOp xla_call_module_op, func::FuncOp entry_func_op, + const EntryFuncBodyQuantizationPattern& body_rewrite_pattern, + const Method& quantization_method) { + SetQuantizedFunctionType(rewriter, entry_func_op, xla_call_module_op); + + body_rewrite_pattern.rewrite(entry_func_op, quantization_method, rewriter); + + // Rename the function to be clear that the function has been quantized. + const std::string quantized_function_name = + GetQuantizedFunctionName(entry_func_op.getSymName()); + entry_func_op.setSymName(quantized_function_name); +} + +// Replaces `xla_call_module_op` with a newly created `func::CallOp`, where the +// callee is `callee_func_op`. The existence of `kQuantizationMethodAttr` in +// `xla_call_module_op` should be guaranteed. +void ReplaceXlaCallModuleOpWithNewCallOp(TF::XlaCallModuleOp xla_call_module_op, + func::FuncOp callee_func_op, + PatternRewriter& rewriter) { + OpBuilder::InsertionGuard insertion_guard(rewriter); + + // Create a new `CallOp` that calls `callee_func_op`. + rewriter.setInsertionPoint(xla_call_module_op); + auto call_op = + rewriter.create(xla_call_module_op.getLoc(), callee_func_op, + xla_call_module_op.getArgs()); + + // Transfer the `kQuantizationMethodAttr` attribute to the `CallOp`, + // indicating what `Method` has been applied to the quantized unit. + call_op->setAttr( + kQuantizationMethodAttr, + xla_call_module_op->getAttrOfType(kQuantizationMethodAttr)); + + rewriter.replaceOp(xla_call_module_op, call_op); +} + +// Replaces a quantized `xla_call_module_op` with a `func::CallOp`. The callee +// is expected to remain unquantized (thus having a signature mismatch), and it +// is also quantized accordingly. +void ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp( + const MLIRContext& ctx, PatternRewriter& rewriter, + TF::XlaCallModuleOp xla_call_module_op, + const EntryFuncBodyQuantizationPattern& body_rewrite_pattern, + const Method& quantization_method) { + const ModuleOp module_op = xla_call_module_op->getParentOfType(); + + func::FuncOp entry_func_op = + GetEntryFuncOp(xla_call_module_op, SymbolTable(module_op)); + QuantizeEntryFuncOp(ctx, rewriter, xla_call_module_op, entry_func_op, + body_rewrite_pattern, quantization_method); + + ReplaceXlaCallModuleOpWithNewCallOp(xla_call_module_op, entry_func_op, + rewriter); +} + +// Pattern that mainly does two things: +// +// 1. Replaces quantized `TF::XlaCallModuleOp` with a `func::CallOp`. +// 2. Quantizes the callee function. +// +// The inputs of this pattern assumes an invalid IR, where even if a +// `TF::XlaCallModuleOp` is quantized the callee remains unquantized. Step (2) +// not only replaces the input and output tensor types into quantized ones, but +// also rewrites the body with a quantized equivalent. +// +// `FuncBodyRewritePatternT` defines how a function body is quantized and +// rewritten. 
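+// Illustrative effect, reusing the naming example given for
+// GetQuantizedFunctionName (attributes elided):
+//   %0 = "tf.XlaCallModule"(%q_input, %q_weight)
+//          {_entry_function = @composite_dot_general_fn_1, ...}
+// becomes
+//   %0 = func.call @quantized_dot_general_fn(%q_input, %q_weight)
+// where @quantized_dot_general_fn now carries the quantized signature and
+// body.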
+template >> +class XlaCallModuleOpToCallOp : public OpRewritePattern { + public: + explicit XlaCallModuleOpToCallOp( + MLIRContext& ctx, const bool enable_per_channel_quantized_weight) + : OpRewritePattern::OpRewritePattern(&ctx), + enable_per_channel_quantized_weight_( + enable_per_channel_quantized_weight) {} + + LogicalResult matchAndRewrite(TF::XlaCallModuleOp op, + PatternRewriter& rewriter) const override { + ModuleOp module_op = op->getParentOfType(); + + // Ignore ops without quantization method. + // Consider adding checks for individual methods. + if (!op->getAttr(kQuantizationMethodAttr)) return failure(); + + // Ignore unquantized ops. + if (!IsQuantizedXlaCallModuleOp(op)) return failure(); + + // For weight-only quantization, op should be hybrid quantized. + if (HasWeightOnlyPtqMethod(op) && !IsHybridQuantizedOp(op)) { + return failure(); + } + + func::FuncOp entry_func_op = GetEntryFuncOp(op, SymbolTable(module_op)); + if (!entry_func_op) { + op->emitError("Failed to find a valid entry function."); + return failure(); + } + Method quantization_method = GetQuantizationMethodOrDefault(op); + if (FuncBodyRewritePatternT(enable_per_channel_quantized_weight_) + .match(entry_func_op, quantization_method) + .failed()) { + return failure(); + } + + // TODO: b/331145946 - Each quantization method should be valid + // (GetQuantizationMethodOrDefault swallows invalid method attribute). Check + // the validity in `match()`. Use accessors to achieve this. + ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp( + *rewriter.getContext(), rewriter, op, + FuncBodyRewritePatternT(enable_per_channel_quantized_weight_), + quantization_method); + return success(); + } + + private: + [[deprecated( + "Do not rely on this field for per-channel quantization. Use `Method` " + "instead.")]] const bool enable_per_channel_quantized_weight_; +}; + +// Quantizes op with regions such as stablehlo.reduce_window op. +// Quantizes only when the nested region consists of ops whose quantization +// parameters can be propagated from outside. +class QuantizeOpWithRegionPattern + : public OpRewritePattern { + public: + explicit QuantizeOpWithRegionPattern(MLIRContext& ctx) + : OpRewritePattern(&ctx) {}; + + LogicalResult matchAndRewrite(mlir::quant::ir::DequantizeCastOp op, + PatternRewriter& rewriter) const final { + if (match(op).failed()) { + return failure(); + } + rewrite(op, rewriter); + return success(); + } + + private: + LogicalResult match(mlir::quant::ir::DequantizeCastOp op) const { + // Match only when there is one user of the dequantize op. + if (!op.getResult().hasOneUse()) { + return failure(); + } + + for (Operation* op_with_region : op.getResult().getUsers()) { + // Among the ops with regions, only reduce_window op is supported for now. + if (!isa(op_with_region)) { + return failure(); + } + + if (!IsNestedRegionQuantizable(op_with_region)) { + return failure(); + } + + // Quantization parameters can be propagated only for same-scale ops and + // same-scale ops are quantized only when they are connected to quantized + // composite functions. + if (!GetStableHloQuantConstraints(op_with_region) + ->has_same_scale_requirement || + !IsConnectedWithQuantizedCompsiteFunction(op_with_region)) { + return failure(); + } + } + return success(); + } + + void rewrite(mlir::quant::ir::DequantizeCastOp op, + PatternRewriter& rewriter) const { + // Rewrite the floating-point ops to the quantized version, by fusing + // preceding dequantize ops and succeding quantize ops. 
+ for (Operation* op_with_region : op.getResult().getUsers()) { + // Collect all the quantized inputs and "clone" the matched op by these + // inputs. + SmallVector inputs; + inputs.reserve(op_with_region->getNumOperands()); + for (Value operand : op_with_region->getOperands()) { + const Type operand_type = operand.getType(); + if (mlir::isa(operand_type)) { + inputs.push_back(operand); + continue; + } + + const Type element_type = + mlir::cast(operand.getType()).getElementType(); + if (auto dq_op = dyn_cast_or_null( + operand.getDefiningOp())) { + inputs.push_back(dq_op.getOperand()); + } else if (isa(element_type)) { + // If the operand is an integer tensor, then it doesn't require the + // DequantizeOp in the pattern. + inputs.push_back(operand); + } else { + return; + } + } + + // Collect all the quantized outputs and replace them by the results of + // the new quantized op. + SmallVector outputs_replaced; + SmallVector output_types; + output_types.reserve(op_with_region->getNumResults()); + for (const Value result : op_with_region->getResults()) { + const Type result_type = result.getType(); + if (mlir::isa(result_type)) { + outputs_replaced.push_back(result); + output_types.push_back(result_type); + continue; + } + const Type result_element_type = + mlir::cast(result.getType()).getElementType(); + // If the user is the QuantizeOp, it must be the only user. + if (result.hasOneUse() && + isa(*result.user_begin())) { + auto user = + cast(*result.user_begin()); + outputs_replaced.push_back(user.getResult()); + output_types.push_back(user.getType()); + } else if (isa(result_element_type)) { + // If the result is an integer tensor, then it doesn't require the + // dequantize op in the pattern. + outputs_replaced.push_back(result); + output_types.push_back(result.getType()); + } else { + return; + } + } + + rewriter.setInsertionPointAfter(op_with_region); + OperationState new_state(op_with_region->getLoc(), + op_with_region->getName().getStringRef(), inputs, + output_types, op_with_region->getAttrs()); + for (int i = 0; i < op_with_region->getNumRegions(); ++i) { + new_state.addRegion(); + } + Operation* quantized_op = rewriter.create(new_state); + for (const auto& [index, region] : + llvm::enumerate(op_with_region->getRegions())) { + Region& target_region = quantized_op->getRegion(index); + IRMapping mapping; + region.cloneInto(&target_region, mapping); + } + + const Type operand_type = quantized_op->getOperandTypes()[0]; + const Type element_type = + mlir::cast(operand_type).getElementType(); + for (Region& region : quantized_op->getRegions()) { + ReplaceTypesInNestedRegion(region, element_type); + } + + for (auto [index, output] : llvm::enumerate(outputs_replaced)) { + output.replaceAllUsesWith(quantized_op->getResult(index)); + } + } + } + + // Checks if an op is quantizable in a nested region. + bool IsOpQuantizableInNestedRegion(Operation& op) const { + return isa(op); + } + + // Checks if a region only consists of ops that are quantizable in a nested + // region. + // tf.CustomAggregator op cannot be inserted into region of a StableHLO op, + // thus calibration is impossible within a nested region. Therefore, when an + // op involves a region, the op is only quantizable when the region only + // consists of ops whose quantization parameters can be propagated from + // outside. 
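+// Example: a max-pool style `stablehlo.reduce_window` (currently the only op
+// with regions matched above) qualifies when every op in its body passes
+// IsOpQuantizableInNestedRegion; its region types are then rewritten to the
+// quantized element type by ReplaceTypesInNestedRegion below.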
+ bool IsNestedRegionQuantizable(Operation* op) const { + for (Region& region : op->getRegions()) { + for (Operation& op : region.getOps()) { + if (!IsOpQuantizableInNestedRegion(op)) { + return false; + } + } + } + return true; + } + + // Replaces all types in nested regions under the assumption that the body + // consists of same-scale ops only. + void ReplaceTypesInNestedRegion(Region& region, + const Type element_type) const { + for (BlockArgument arg : region.getArguments()) { + arg.setType(ReplaceElementType(arg.getType(), element_type)); + } + + for (Operation& op : region.getOps()) { + for (Value operand : op.getOperands()) { + operand.setType(ReplaceElementType(operand.getType(), element_type)); + } + + for (Value result : op.getResults()) { + result.setType(ReplaceElementType(result.getType(), element_type)); + } + } + } + + // Replaces element type of the given tensor type while preserving shape of + // the given type. If the given type is not tensor type, just return itself. + Type ReplaceElementType(const Type type, const Type element_type) const { + if (TensorType tensor_type = mlir::dyn_cast(type)) { + return tensor_type.clone(element_type); + } + return type; + } +}; + +} // namespace + +// Checks if an op calls a composite function and all the inputs and outputs are +// quantized. +bool IsQuantizedCompositeFunction(func::CallOp call_op) { + if (!call_op.getCallee().starts_with("quantized_")) { + return false; + } + + bool has_quantized_types = false; + for (Value operand : call_op.getOperands()) { + if (const TensorType type = mlir::dyn_cast(operand.getType())) { + if (mlir::isa(type.getElementType())) { + return false; + } + if (mlir::isa( + type.getElementType())) { + has_quantized_types = true; + } + } + } + for (const Value result : call_op.getResults()) { + if (const auto type = mlir::dyn_cast(result.getType())) { + if (mlir::isa(type.getElementType())) { + return false; + } + if (mlir::isa( + type.getElementType())) { + has_quantized_types = true; + } + } + } + return has_quantized_types; +} + +bool IsConnectedWithQuantizedCompsiteFunction(Operation* same_scale_op) { + for (const Value operand : same_scale_op->getOperands()) { + auto dq_op = dyn_cast_or_null( + operand.getDefiningOp()); + if (!dq_op) continue; + + Operation* preceding_op = dq_op.getArg().getDefiningOp(); + if (!preceding_op) continue; + + // Check whether the preceding op is a quantized composite function. + if (isa(preceding_op)) { + auto call_op = cast(preceding_op); + if (!IsQuantizedCompositeFunction(call_op)) continue; + return true; + } + + // Check whether the preceding op is a quantized same-scale op. + if (GetStableHloQuantConstraints(preceding_op) + ->has_same_scale_requirement) { + for (const OpResult result : preceding_op->getResults()) { + const Type element_type = getElementTypeOrSelf(result.getType()); + if (mlir::isa(element_type)) { + return true; + } + } + } + } + + for (const Value result : same_scale_op->getResults()) { + // If the user is the Quantize op, it must be the only user. + if (!result.hasOneUse() || + !isa(*result.user_begin())) { + continue; + } + + auto q_op = cast(*result.user_begin()); + for (Operation* following_op : q_op->getUsers()) { + // Check whether the following op is a quantized composite function. + if (isa(following_op)) { + auto call_op = cast(following_op); + if (!IsQuantizedCompositeFunction(call_op)) continue; + return true; + } + + // Check whether the following op is a quantized same-scale op. 
+ if (GetStableHloQuantConstraints(following_op) + ->has_same_scale_requirement) { + for (Value operand : following_op->getOperands()) { + const Type element_type = getElementTypeOrSelf(operand.getType()); + if (mlir::isa(element_type)) { + return true; + } + } + } + } + } + + return false; +} + +// Compute heavy patterns should be quantized for both server and ODML targets. +// Most patterns here are useful when quantized since they are compute heavy +// or memory bound. +void PopulateCommonQuantizationPatterns( + MLIRContext& ctx, RewritePatternSet& patterns, + const bool enable_per_channel_quantized_weight) { + patterns.add>( + ctx, enable_per_channel_quantized_weight); + patterns.add>( + ctx, enable_per_channel_quantized_weight); + patterns + .add>>( + ctx, enable_per_channel_quantized_weight); + patterns + .add>>( + ctx, enable_per_channel_quantized_weight); + // TODO: b/307620772 - Per-channel quantization for gather. + patterns.add>>( + ctx, /*enable_per_channel_quantized_weight=*/false); + // Populate pattern for quantization of ops with regions such as + // `stablehlo.reduce_window` op. + patterns.add(ctx); +} + +void PopulateAllQuantizablePatterns(MLIRContext& ctx, + RewritePatternSet& patterns) { + patterns.add>>( + ctx, /*enable_per_channel_quantized_weight=*/false); +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantization_patterns.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantization_patterns.h new file mode 100644 index 000000000000..f1098ed0aa12 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantization_patterns.h @@ -0,0 +1,254 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_TF_QUANTIZATION_PATTERNS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_TF_QUANTIZATION_PATTERNS_H_ + +#include +#include + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_traits.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/ops/tf_stablehlo_op_quant_spec.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace mlir::tf_quant::stablehlo { + +// Checks whether an op is connected with a quantized composite function. If +// not, the same-scale op will not be quantized. This decision is based on the +// current assumption that the performance gain of the same-scale op itself +// could not beat the overhead of the quantize and dequantize routines need to +// be added around that op. When the assumption changes, this policy might +// change as well. +bool IsConnectedWithQuantizedCompsiteFunction(Operation* same_scale_op); + +// A base rewrite pattern which matches any N-in-M-out operations with +// quantization parameters propagated to at least one of its operands. The +// quantization parameters are annotated by the QuantizeOp/DequantizeOp pairs. +// Each matched pattern are rewritten by its quantized alternatives. +// +// Quantization method is determined by the `_quantization_method` attributes +// attached to each quantizable units. +// +// Template constraints are imposed as follows: +// +// * `QuantizeOpT` should have only one operand. +// * `DequantizeOpT` should have only one result. +template () && + DequantizeOpT::template hasTrait()>> +class StableHloQuantizationPattern : public OpRewritePattern { + public: + explicit StableHloQuantizationPattern(MLIRContext* context) + // Set the benefit to a large number so that it is always preferred. + : OpRewritePattern(context, /*benefit=*/300) {} + + private: + // Collects all candidate ops for quantization, which are the + // `dequantize_op`'s users. + FailureOr> CollectCandidateOps( + DequantizeOpT dequantize_op) const { + auto users = dequantize_op->getResult(0).getUsers(); + return SmallVector(users.begin(), users.end()); + } + + // Collects all candidate ops for quantization, which is the operand of + // `quantize_op`. If successful, this always returns one element which is the + // operand of `quantize_op`. 
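+  // This overload handles the pattern rooted at a quantize op (the "reverse"
+  // direction), which is mainly useful for quantizable ops whose operands are
+  // already non-float, so no dequantize op precedes them.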
+ FailureOr> CollectCandidateOps( + QuantizeOpT quantize_op) const { + Value operand = quantize_op->getOperand(0); + if (QuantizedType::getQuantizedElementType(operand.getType())) { + // The input of the quantize op has already been quantized, i.e. + // rescale. + return failure(); + } + + Operation* operand_op = operand.getDefiningOp(); + if (operand_op == nullptr) { + // When `QuantizeOpT`'s operand does not have a defining op, it means it + // is a `BlockArgument`. The pattern does not match if there is no op to + // quantize. + return failure(); + } + + if (operand_op->hasTrait()) { + // Const-> QuantizeOp pattern will be handled separately. + return failure(); + } + + return SmallVector{operand_op}; + } + + LogicalResult matchAndRewrite(RootOpT op, + PatternRewriter& rewriter) const override { + // Collect all the candidate ops for quantization. + FailureOr> candidate_ops = CollectCandidateOps(op); + // Safeguard check to ensure that there is at least one quantizable op. + if (failed(candidate_ops) || candidate_ops->empty()) return failure(); + + // Rewrite the floating-point ops to the quantized version, by fusing + // preceding dequantize ops and succeding quantize ops. + for (Operation* candidate_op : *candidate_ops) { + // If it is requantize op, we shouldn't rewrite this op. + if (isa(candidate_op)) { + return failure(); + } + + // If the op is terminator, we shouldn't rewrite. + if (candidate_op->hasTrait()) { + return failure(); + } + + if (!IsOpQuantizableStableHlo(candidate_op)) { + return failure(); + } + + if (GetStableHloQuantConstraints(candidate_op) + ->has_same_scale_requirement && + !IsConnectedWithQuantizedCompsiteFunction(candidate_op)) { + return failure(); + } + + // Ops with regions will be quantized in a separate pattern. + if (isa(candidate_op)) { + return failure(); + } + + const bool weight_only_quantizable = + IsWeightOnlyQuantizableOp(*candidate_op); + + // Collect all the quantized inputs and "clone" the matched op by these + // inputs. + SmallVector inputs; + inputs.reserve(candidate_op->getNumOperands()); + for (auto operand : candidate_op->getOperands()) { + Type operand_type = operand.getType(); + if (mlir::isa(operand_type)) { + inputs.push_back(operand); + continue; + } + + auto ele_type = + mlir::cast(operand.getType()).getElementType(); + if (auto dq_op = + dyn_cast_or_null(operand.getDefiningOp())) { + inputs.push_back(dq_op.getOperand()); + } else if (!ele_type.isF32()) { + // If the operand is an integer tensor, then it doesn't require the + // DequantizeOp in the pattern. + inputs.push_back(operand); + } else if (weight_only_quantizable) { + inputs.push_back(operand); + } else { + return failure(); + } + } + + // Collect all the quantized outputs and replace them by the results of + // the new quantized op. + llvm::SmallDenseMap outputs_replaced; + SmallVector output_types; + output_types.reserve(candidate_op->getNumResults()); + for (const auto& enumerated_result : + llvm::enumerate(candidate_op->getResults())) { + Value result = enumerated_result.value(); + Type result_type = result.getType(); + // Add this to the test coverage once we create test ops with none type + // results. + if (mlir::isa(result_type)) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result_type); + continue; + } + Type result_ele_type = + mlir::cast(result.getType()).getElementType(); + // If the user is the QuantizeOp, it must be the only user. 
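+        // Otherwise the floating-point result is still consumed by other ops,
+        // and fusing the quantize op into this result would change what those
+        // other users observe.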
+ if (result.hasOneUse() && isa(*result.user_begin())) { + auto user = cast(*result.user_begin()); + outputs_replaced.insert( + {user.getResult(), enumerated_result.index()}); + output_types.push_back(user.getType()); + } else if (!result_ele_type.isF32()) { + // If the result is an integer tensor, then it doesn't require the + // D op in the pattern. + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result.getType()); + } else if (weight_only_quantizable) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result.getType()); + } else { + return failure(); + } + } + + rewriter.setInsertionPointAfter(candidate_op); + OperationState new_state(candidate_op->getLoc(), + candidate_op->getName().getStringRef(), inputs, + output_types, candidate_op->getAttrs()); + for (int i = 0; i < candidate_op->getNumRegions(); ++i) { + new_state.addRegion(); + } + Operation* quantized_op = rewriter.create(new_state); + if (candidate_op->getNumRegions() != 0) { + for (const auto& indexed_regions : + llvm::enumerate(candidate_op->getRegions())) { + Region& target_region = + quantized_op->getRegion(indexed_regions.index()); + IRMapping mapping; + indexed_regions.value().cloneInto(&target_region, mapping); + } + } + for (auto output : outputs_replaced) { + output.getFirst().replaceAllUsesWith( + quantized_op->getResult(output.getSecond())); + } + } + return success(); + } +}; + +// Populates common patterns that are usually compute heavy or memory bound. +void PopulateCommonQuantizationPatterns( + MLIRContext& ctx, RewritePatternSet& patterns, + bool enable_per_channel_quantized_weight); + +// Populates conversion patterns for all quantizable ops, including +// ops that are not compute-heavy and data movement ops. +void PopulateAllQuantizablePatterns(MLIRContext& ctx, + RewritePatternSet& patterns); + +} // namespace mlir::tf_quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_TF_QUANTIZATION_PATTERNS_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantize.cc new file mode 100644 index 000000000000..5dad68992a80 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantize.cc @@ -0,0 +1,111 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantization_patterns.h" + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_QUANTIZEPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +// Base struct for quantization. +template +struct StableHloQuantizationBase + : public StableHloQuantizationPattern { + explicit StableHloQuantizationBase(MLIRContext* ctx) + : StableHloQuantizationPattern(ctx) {} + + static bool AllowWeightOnlyQuantization(Operation& op) { return false; } +}; + +// Quantization rewrite pattern using DQ as the root op. +struct StableHloQuantization + : public StableHloQuantizationBase { + explicit StableHloQuantization(MLIRContext* ctx) + : StableHloQuantizationBase(ctx) {} +}; + +// Quantization rewrite pattern using Q as the root op. This is for the +// quantizable ops without floating-point operands. +struct StableHloQuantizationReverse + : public StableHloQuantizationBase { + explicit StableHloQuantizationReverse(MLIRContext* ctx) + : StableHloQuantizationBase(ctx) {} +}; + +class QuantizePass : public impl::QuantizePassBase { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(QuantizePass) + + using impl::QuantizePassBase::QuantizePassBase; + + explicit QuantizePass(const bool enable_per_channel_quantized_weight) { + enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight; + } + + private: + void runOnOperation() override; +}; + +void QuantizePass::runOnOperation() { + ModuleOp module_op = getOperation(); + MLIRContext& ctx = getContext(); + + RewritePatternSet patterns(&ctx); + patterns.add(&ctx); + + PopulateCommonQuantizationPatterns(ctx, patterns, + enable_per_channel_quantized_weight_); + + // Quantize all quantizable ops, including ops that are not compute-heavy. + PopulateAllQuantizablePatterns(ctx, patterns); + + if (failed(applyPatternsGreedily(module_op, std::move(patterns)))) { + // There are cases where no rewrites happen even if a pattern matches, + // causing this to result in a convergence failure. Consider this as a + // best-effort. 
+ module_op.emitWarning("Failed to converge pattern at QuantizePass."); + } +} + +} // namespace + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantize_composite_functions.cc new file mode 100644 index 000000000000..38379ef7b12d --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantize_composite_functions.cc @@ -0,0 +1,114 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" // IWYU pragma: keep + +#define DEBUG_TYPE "quantize-composite-functions" + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_QUANTIZECOMPOSITEFUNCTIONSPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +using ::tensorflow::quantization::RunPassesOnModuleOp; + +class QuantizeCompositeFunctionsPass + : public impl::QuantizeCompositeFunctionsPassBase< + QuantizeCompositeFunctionsPass> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(QuantizeCompositeFunctionsPass) + + using impl::QuantizeCompositeFunctionsPassBase< + QuantizeCompositeFunctionsPass>::QuantizeCompositeFunctionsPassBase; + + explicit QuantizeCompositeFunctionsPass( + const bool enable_per_channel_quantized_weight) { + enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight; + } + + private: + void runOnOperation() override; +}; + +void QuantizeCompositeFunctionsPass::runOnOperation() { + MLIRContext& ctx = getContext(); + + PassManager pm(&ctx); + // Intermediate output from QuantizePass will have quantized ops + // (XlaCallModuleOps) with quantized input and output types, which are not + // allowed in the TF dialect. 
+ pm.enableVerifier(false); + + PrepareQuantizePassOptions options; + options.enable_per_channel_quantized_weight_ = + enable_per_channel_quantized_weight_; + // Change this to user-given bit width once we have custom configuration. + options.bit_width_ = 8; + + // Insert quantization parameters for weights for ops with `weight_only_ptq` + // attribute. + pm.addNestedPass(createInsertWeightParamPass()); + + // PrepareQuantizePass uses SymbolTable to fetch relevant GEMM ops for + // determining quantization attributes. This requires module-level context. + pm.addPass(createPrepareQuantizePass(options)); + + QuantizePassOptions quantize_options; + quantize_options.enable_per_channel_quantized_weight_ = + enable_per_channel_quantized_weight_; + + // QuantizePass modifies FuncOps referenced outside of its given scope + // and therefore requires a module-level context. + pm.addPass(createQuantizePass(quantize_options)); + pm.addNestedPass(createPostQuantizePass()); + + // Convert XlaCallModuleOps lifted but not quantized to func.call op. + // The reasons these ops are not quantized may be: + // 1. Disabled due to selective quantization. + // 2. Not supported, e.g. add op for server. + pm.addPass(createXlaCallModuleToCallPass()); + + // TODO: b/321729008 - move this implementation to quantization_patterns.cc. + if (merge_fusion_with_dequantize_) { + pm.addPass(createMergeFusionWithDequantizePass()); + } + + ModuleOp module_op = getOperation(); + if (const absl::Status pm_run_status = + RunPassesOnModuleOp(mlir_dump_file_name_, pm, module_op); + !pm_run_status.ok()) { + signalPassFailure(); + } +} + +} // namespace +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantize_weight.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantize_weight.cc new file mode 100644 index 000000000000..3b3435298f38 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_quantize_weight.cc @@ -0,0 +1,244 @@ +/* Copyright 2023 The StableHLO Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include + +#include "Eigen/Core" // from @eigen_archive +#include "llvm/ADT/SetVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Rewrite/FrozenRewritePatternSet.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" + +// NOLINTNEXTLINE +//===----------------------------------------------------------------------===// +// The Quantization Pass for Weight. +//===----------------------------------------------------------------------===// + +namespace mlir::tf_quant::stablehlo { + +// Put the definitions inside the ::mlir::tf_quant::stablehlo namespace, to +// match the declarations in tf_passes.h. +#define GEN_PASS_DEF_QUANTIZEWEIGHTPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +using QuantizationUnits = llvm::SetVector>; +using mlir::stablehlo::ConstantOp; +using mlir::stablehlo::ConvertOp; +using ::stablehlo::quantization::QuantizationComponentSpec; + +// Min/Max values used for creating ConstantOp. +constexpr float kMaxFloat16Value = 65504.f; +constexpr float kMinFloat16Value = -65504.f; + +class QuantizeWeightPass + : public impl::QuantizeWeightPassBase { + public: + explicit QuantizeWeightPass( + QuantizationComponentSpec quantization_component_spec) + : quantization_component_spec_(quantization_component_spec) {} + + private: + void runOnOperation() override; + QuantizationComponentSpec quantization_component_spec_; +}; + +// Collects quantizable target ops, then insert Q-DQ quantization patterns. +class QuantizeWeight : public OpRewritePattern { + public: + explicit QuantizeWeight( + MLIRContext* context, + const QuantizationComponentSpec& quantization_component_spec) + : OpRewritePattern(context), + quantization_component_spec_(quantization_component_spec) {} + + LogicalResult matchAndRewrite(ConstantOp op, + PatternRewriter& rewriter) const override { + // 1. Collect quantizable ops. + QuantizationUnits quantizable_ops = GetQuantizableOps(op); + if (quantizable_ops.empty()) { + return failure(); + } + + // 2. Quantize collected ops. + if (!QuantizeOps(rewriter, op, quantizable_ops)) { + return failure(); + } + + // 3. Complete the Q-DQ pair for each inference type. + if (!ConvertToFloat16Constant(rewriter, op)) { + return failure(); + } + return success(); + } + + private: + const QuantizationComponentSpec quantization_component_spec_; + // Marks users that are applicable for quantization where the criteria for + // determining quantizable ops differs by the inference type. 
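+  // For the float16 path implemented here, every use of a float32 constant
+  // qualifies, so this simply records (user, operand index) pairs.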
+ QuantizationUnits GetQuantizableOps(ConstantOp op) const { + // Non-float tensors do not need quantization. + QuantizationUnits quantizable_ops; + const ShapedType type = mlir::dyn_cast(op.getType()); + if (!type || !type.getElementType().isF32()) return quantizable_ops; + + const Value value = op.getResult(); + + for (OpOperand& use : value.getUses()) { + Operation* user = use.getOwner(); + const int operand_num = use.getOperandNumber(); + quantizable_ops.insert({user, operand_num}); + } + return quantizable_ops; + } + + // Returns whether quantization is applied to filtered users. + bool QuantizeOps(PatternRewriter& rewriter, ConstantOp op, + const QuantizationUnits& quantizable_ops) const { + for (const std::pair& quant_op : quantizable_ops) { + // For f16 quantization, quantize all constant ops as float16. + QuantizeOpAsFloat16(rewriter, op, quant_op); + } + // TODO: b/264218457 - Return a value that accurately captures result + // status. + return true; + } + + // Inserts ConvertOp which is used for converting float32 ConstantOp into + // float16 quantization. If there is an existing ConvertOp connected to the + // ConstantOp, the quantizable_op will be rewired to the existing ConvertOp. + // This guarantees at most one ConvertOp is created for float32 to float16 + // conversion. + void QuantizeOpAsFloat16(PatternRewriter& rewriter, ConstantOp op, + const std::pair quant_op) const { + const auto [quantizable_op, quantize_operand_num] = quant_op; + // If the constant is an output tensor, do nothing. + if (isa(quantizable_op)) { + return; + } + + TensorType old_result_type = + mlir::dyn_cast(op.getResult().getType()); + const FloatType quantized_type = Float16Type::get(op.getContext()); + const ShapedType new_result_type = old_result_type.clone(quantized_type); + + // Insert ConvertOp if it does not exist yet. Otherwise, just rewire without + // creating a ConvertOp. + for (const OpOperand& connected_op : op.getResult().getUses()) { + ConvertOp convert_op = + dyn_cast_or_null(connected_op.getOwner()); + // ConvertOp already exists. Rewire the existing convert op into f16. + if (convert_op && convert_op.getType() == new_result_type) { + quantizable_op->setOperand(quantize_operand_num, convert_op); + return; + } + } + rewriter.setInsertionPointAfter(op); + ConvertOp new_convert_op = rewriter.create( + op->getLoc(), new_result_type, op.getResult()); + quantizable_op->setOperand(quantize_operand_num, + new_convert_op.getResult()); + } + + // Returns whether a ConvertOp-Operation sequence can be converted into new + // ConstantOp-Convert-Operation. The new ConstantOp has float16 data type. + bool ConvertToFloat16Constant(PatternRewriter& rewriter, + ConstantOp op) const { + for (Operation* connected_op : op.getResult().getUsers()) { + ConvertOp convert_op = dyn_cast_or_null(connected_op); + // Skip if no convert op exists. + if (!convert_op || convert_op.getResult().use_empty()) continue; + + // Get types. + const Type old_result_type = op.getResult().getType(); + const ShapedType new_result_type = + mlir::dyn_cast(convert_op.getType()); + + // Proceeds only if the converting is to float16. + if (!new_result_type.getElementType().isF16()) continue; + + // Convert values. 
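+      // Clamp each float32 value to the representable float16 range first so
+      // that out-of-range values saturate instead of overflowing to infinity.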
+      std::vector<Eigen::half> new_values;
+      const DenseFPElementsAttr value_attr =
+          mlir::cast<DenseFPElementsAttr>(op.getValue());
+      new_values.reserve(value_attr.getNumElements());
+
+      for (const float value : value_attr.getValues<float>()) {
+        new_values.push_back(Eigen::half(
+            std::min(std::max(value, kMinFloat16Value), kMaxFloat16Value)));
+      }
+      const DenseElementsAttr new_value_attr = DenseFPElementsAttr::get(
+          new_result_type, ArrayRef<Eigen::half>(new_values));
+      // Create new ConstantOp-ConvertOp-Operation sequences. At this moment,
+      // the old ConstantOp is guaranteed to have one F32->F16 convert op
+      // regardless of its number of users.
+      rewriter.setInsertionPointAfter(op);
+      // Create the new F16 ConstantOp at that location.
+      ConstantOp new_const = rewriter.create<ConstantOp>(
+          op->getLoc(), new_result_type, new_value_attr);
+      ConvertOp dcast =
+          rewriter.create<ConvertOp>(op->getLoc(), old_result_type, new_const);
+      // Replace all uses of the original ConvertOp with the dequantizing
+      // ConvertOp.
+      convert_op->replaceAllUsesWith(dcast);
+      // Return without scanning for the next ConvertOp as only one ConvertOp is
+      // connected to all quantizable ops.
+      return true;
+    }
+    return false;
+  }
+};
+
+// TODO: b/264218457 - Refactor the current file to parse preset quantization
+// options and allow modular control of quantization specs.
+void QuantizeWeightPass::runOnOperation() {
+  func::FuncOp func = getOperation();
+  MLIRContext* ctx = func.getContext();
+  RewritePatternSet patterns(ctx);
+
+  patterns.add<QuantizeWeight>(ctx, quantization_component_spec_);
+
+  FrozenRewritePatternSet frozen_patterns(std::move(patterns));
+
+  if (failed(applyPatternsGreedily(func, frozen_patterns))) {
+    signalPassFailure();
+  }
+}
+
+}  // namespace
+
+// Creates an instance of the StableHLO dialect Quantize Weight pass.
+std::unique_ptr<OperationPass<func::FuncOp>> CreateQuantizeWeightPass(
+    const QuantizationComponentSpec& quantization_component_spec) {
+  return std::make_unique<QuantizeWeightPass>(quantization_component_spec);
+}
+
+}  // namespace mlir::tf_quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_remove_sharding_custom_call.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_remove_sharding_custom_call.cc
new file mode 100644
index 000000000000..cae6c33226dc
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_remove_sharding_custom_call.cc
@@ -0,0 +1,59 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Rewrite/FrozenRewritePatternSet.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_REMOVESHARDINGCUSTOMCALLPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +// Include patterns generated from `remove_sharding_custom_call.td`. +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/remove_sharding_custom_call.inc" + +class RemoveShardingCustomCallPass + : public impl::RemoveShardingCustomCallPassBase< + RemoveShardingCustomCallPass> { + public: + using impl::RemoveShardingCustomCallPassBase< + RemoveShardingCustomCallPass>::RemoveShardingCustomCallPassBase; + + private: + void runOnOperation() override; +}; + +void RemoveShardingCustomCallPass::runOnOperation() { + func::FuncOp func_op = getOperation(); + MLIRContext& ctx = getContext(); + + RewritePatternSet patterns(&ctx); + populateWithGenerated(patterns); + + FrozenRewritePatternSet frozen_patterns(std::move(patterns)); + if (failed(applyPatternsGreedily(func_op, frozen_patterns))) { + func_op.emitWarning() << "Failed to converge " + << RemoveShardingCustomCallPass::getArgumentName(); + } +} + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc new file mode 100644 index 000000000000..6e4a608857e3 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc @@ -0,0 +1,536 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/ErrorHandling.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/ValueRange.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "stablehlo/dialect/Version.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/func.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" +#include "tensorflow/core/ir/types/dialect.h" + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_REPLACESTABLEHLOOPSINMAINFUNCTIONWITHXLACALLMODULEOPSPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +constexpr StringRef kStablehloModuleAttrsAttrName = "_stablehlo_module_attrs"; +constexpr StringRef kUsesShapePolymorphismAttr = "jax.uses_shape_polymorphism"; + +// Default version number for native serialization. +constexpr int64_t kDefaultVersion = 9; +// Platforms for XlaCallModuleOp. +constexpr StringRef kPlatformCpu = "CPU"; +constexpr StringRef kPlatformTpu = "TPU"; + +class ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass + : public impl:: + ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPassBase< + ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass) + + ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass() = default; + + ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass( + const ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass& other) = + default; + + private: + void runOnOperation() override; +}; + +// Creates a unique stablehlo function name based on op order. +std::string CreateStablehloFunctionName(const int id) { + return Twine("_stablehlo_main_").concat(std::to_string(id)).str(); +} + +// Follows the structure of Live-variable analysis. It is a form of +// CFG (Control Flow Graph) analysis, often used in compilers. +// +// A variable is live if it holds a value that may be used in the future. +// It is live-in at node n if it is live on any of the node's in-edges. +// It is live-out at node n if it is live on any of the node's out-edges. +// def[n] refers to values that are defined at node n. +// use[n] refers to values that are used at node n. 
+//
+// Given a node n, a variable's liveness is defined as follows:
+// live_in[n] = use[n] U (live_out[n] - def[n])
+// live_out[n] = U {live_in[s] | s ε succ[n]}
+//
+// Consider a sequence of ops:
+//
+// ```
+// node 1: %0 = stablehlo.constant
+// node 2: %1 = stablehlo.constant
+// node 3: %2 = stablehlo.add %0, %1
+// node 4: %3 = stablehlo.multiply %2, %1
+// node 5: return %3
+// ```
+//
+// In backward liveness analysis, the liveness at each node above becomes:
+// live_in[5] = use[5] U (live_out[5] - def[5])
+//            = {%3} U ({∅} - {∅}) = {%3}
+// live_in[4] = use[4] U (live_out[4] - def[4])
+//            = {%1, %2} U ({%3} - {%3}) = {%1, %2}
+// live_in[3] = use[3] U (live_out[3] - def[3])
+//            = {%0, %1} U ({%1, %2} - {%2}) = {%0, %1}
+// live_in[2] = use[2] U (live_out[2] - def[2])
+//            = {∅} U ({%0, %1} - {%1}) = {%0}
+// live_in[1] = use[1] U (live_out[1] - def[1])
+//            = {∅} U ({%0} - {%0}) = {∅}
+//
+// This analogy is used throughout this pass to ensure that only live edges
+// form proper subgraphs.
+class LiveOuts {
+ public:
+  LiveOuts() = default;
+
+  explicit LiveOuts(OperandRange range)
+      : liveouts_(range.begin(), range.end()), prev_liveouts_(liveouts_) {}
+
+  // Deletes the current op's results from liveouts and moves on to the parent
+  // ops by inserting its operands.
+  void update(Operation& op) {
+    for (Value result_value : op.getResults()) {
+      liveouts_.remove(result_value);
+    }
+    for (Value operand : op.getOperands()) {
+      liveouts_.insert(operand);
+    }
+  }
+
+  // Snapshots the current live values to the previous live values.
+  void snapshot_previous_state() { prev_liveouts_ = liveouts_; }
+
+  // Returns the current live values.
+  const SetVector<Value>& get() const { return liveouts_; }
+
+  // Returns the previous live values.
+  const SetVector<Value>& get_previous() const { return prev_liveouts_; }
+
+ private:
+  // Use SetVector to ensure a deterministic traversal order.
+  SetVector<Value> liveouts_;
+  SetVector<Value> prev_liveouts_;
+};
+
+// Creates the tf.XlaCallModuleOp from attributes.
+void CreateXlaCallModuleOp(ValueRange inputs, ValueRange outputs,
+                           const TypeRange result_types,
+                           const SetVector<Operation*>& reverse_subgraph,
+                           const func::FuncOp stablehlo_func_op,
+                           ModuleOp module_op) {
+  MLIRContext* ctx = module_op.getContext();
+  OpBuilder builder(ctx);
+  Operation* last_subgraph_op = reverse_subgraph.front();
+  builder.setInsertionPointAfter(last_subgraph_op);
+
+  // Create attributes used for creating an XlaCallModuleOp.
+  SmallVector<Attribute> shape_attrs;
+  for (const Type result_type : result_types) {
+    shape_attrs.push_back(
+        tf_type::ShapeAttr::get(ctx, mlir::cast<ShapedType>(result_type)));
+  }
+  const auto empty_array_attr = ArrayAttr::get(ctx, {});
+  // TODO: b/310291615 - find a better way for platform support.
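+  // Both CPU and TPU are always listed. The platform_index argument that
+  // ReplaceStablehloOpsWithXlaCallModuleOp prepends to the arguments selects
+  // between them at runtime (e.g. 0 for "CPU", 1 for "TPU", following the
+  // order of the `platforms` attribute below).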
+ const auto platforms = ArrayAttr::get( + ctx, + {StringAttr::get(ctx, kPlatformCpu), StringAttr::get(ctx, kPlatformTpu)}); + + auto xla_call_module_op = builder.create( + module_op.getLoc(), /*output=*/result_types, + /*args=*/inputs, + /*version=*/kDefaultVersion, /*module=*/"", + /*Sout=*/ArrayAttr::get(ctx, shape_attrs), + /*dim_args_spec=*/empty_array_attr, platforms, + /*function_list=*/empty_array_attr, + /*has_token_input_output=*/false, + /*disabled_checks=*/empty_array_attr); + xla_call_module_op->setAttr(TF::kStablehloEntryFunctionAttrName, + SymbolRefAttr::get(stablehlo_func_op)); + std::string target_version = + mlir::vhlo::Version::fromCompatibilityRequirement( + vhlo::Version::CompatibilityRequirement::WEEK_4) + .toString(); + xla_call_module_op->setAttr(TF::kStablehloVersionAttrName, + builder.getStringAttr(target_version)); + // Set jax.uses_shape_polymorphism=true to enable shape refinement at runtime. + // This is needed for native serialization version >= 8. + xla_call_module_op->setAttr( + kStablehloModuleAttrsAttrName, + builder.getDictionaryAttr(builder.getNamedAttr( + kUsesShapePolymorphismAttr, builder.getBoolAttr(true)))); + + for (auto [original_output_value, xla_call_module_op_result_value] : + llvm::zip_equal(outputs, xla_call_module_op->getResults())) { + original_output_value.replaceAllUsesExcept(xla_call_module_op_result_value, + /*exceptedUser=*/nullptr); + } +} + +// Replaces the StableHLO ops with a separate XlaCallModuleOp, then wires it +// back into the main graph. +void ReplaceStablehloOpsWithXlaCallModuleOp( + const ArrayRef inputs, const ArrayRef outputs, + const SetVector& reverse_subgraph, const int stablehlo_func_id, + ModuleOp module_op) { + MLIRContext* ctx = module_op.getContext(); + OpBuilder builder(ctx); + + // Identify arg types & arg locs. + SmallVector arg_types; + SmallVector arg_locs; + + // Add an argument for platform_index. This allows for multiple platforms. + // TODO: b/310291615 - find a better way for platform support. + arg_types.push_back(RankedTensorType::get({}, builder.getI32Type())); + arg_locs.push_back(module_op.getLoc()); + for (const Value input_value : inputs) { + arg_types.push_back(input_value.getType()); + arg_locs.push_back(input_value.getLoc()); + } + + // Identify result types. + SmallVector result_types; + for (const Value output_value : outputs) { + result_types.push_back(output_value.getType()); + } + + // 1) Create FuncOp for the StableHLO ops. They will be separate subgraphs. + builder.setInsertionPoint(&*module_op.begin()); + auto stablehlo_func_op = builder.create( + module_op.getLoc(), CreateStablehloFunctionName(stablehlo_func_id), + FunctionType::get(ctx, arg_types, result_types)); + stablehlo_func_op.setVisibility(SymbolTable::Visibility::Private); + stablehlo_func_op->setAttr(TF::kFromXlaCallModuleAttrName, + builder.getUnitAttr()); + + builder.createBlock(&stablehlo_func_op.getBody(), stablehlo_func_op.begin(), + arg_types, arg_locs); + + IRMapping mapper; + // stablehlo_func_op has 1 extra arg for platform index. + for (auto [input, stablehlo_func_arg] : llvm::zip_equal( + inputs, stablehlo_func_op.getArguments().take_back(inputs.size()))) { + mapper.map(input, stablehlo_func_arg); + } + + for (Operation* subgraph_op : llvm::reverse(reverse_subgraph)) { + // Create a deep copy of the subgraph ops' operands to the func op. 
+ stablehlo_func_op.getBody().begin()->push_back(subgraph_op->clone(mapper)); + } + + SmallVector result_values; + for (const Value original_output_value : outputs) { + // Use the mapped values in the newly created function that correspond to + // outputs in the original function. + result_values.push_back(mapper.lookup(original_output_value)); + } + builder.create(module_op.getLoc(), result_values); + + // 2) Create XlaCallModuleOp (with ops mapped). + CreateXlaCallModuleOp(inputs, outputs, result_types, reverse_subgraph, + stablehlo_func_op, module_op); + + // 3) Erase the replaced ops. + for (Operation* subgraph_op : reverse_subgraph) { + subgraph_op->erase(); + } +} + +// Contains the actual logic for updating states and replacing StableHLO ops +// with tf.XlaCallModuleOps. +void UpdateStatesAndReplaceStablehloOps( + const SetVector& operands, const SetVector& defined_values, + const LiveOuts& liveouts, ModuleOp module_op, + const SetVector& reverse_subgraph, const int stablehlo_func_id, + func::FuncOp main_func, const bool is_last_subgraph = false) { + SetVector inputs = operands; + for (Value defined_value : defined_values) { + inputs.remove(defined_value); + } + + SetVector outputs = liveouts.get_previous(); + for (const Value live_value : liveouts.get()) { + outputs.remove(live_value); + } + + if (is_last_subgraph) { + // Additionally remove arguments from the outputs, as it provides liveness + // throughout (functions as an invisible op above the very first op that + // returns the arguments). + for (const BlockArgument arg : main_func.getArguments()) { + outputs.remove(arg); + } + } + + ReplaceStablehloOpsWithXlaCallModuleOp( + SmallVector(inputs.begin(), inputs.end()), + SmallVector(outputs.begin(), outputs.end()), reverse_subgraph, + stablehlo_func_id, module_op); +} + +// Check if the op should be added to the subgraph. +// The op should be added to the subgraph if all of its users match one +// of following two conditions: +// 1: The user is already in the current subgraph. +// 2: The user will reach a dead end. +// +// If the op should be added to the subgraph and there are users who +// will reach the dead end, add the ops on the dead end to the subgraph as well. +bool ShouldAddOpToSubgraph(Operation* op, + const SetVector& reverse_subgraph, + const SetVector& ops_to_add, + SmallVector& all_descendants) { + if (!op) { + return false; + } + + SmallVector current_layer_descendants; + SmallVector next_layer_descendants; + int current_depth = 0; + current_layer_descendants.push_back(op); + // BFS downstream ops for current user. + // If any one of the descendants meet one of the three conditions, we return + // false for the current value: + // 1: The descendant is not in the ops_to_add. + // 2: The descendant is not a stablehlo op. + // 3: The depth of the descendant is larger than 5, we don't want to search + // too deep, max depth is arbitrarily chosen. 
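+  // The loop below is a breadth-first walk over the users of `op`;
+  // `all_descendants` accumulates the visited ops so that the caller can pull
+  // them into the subgraph when every probed path stays quantizable.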
+ while (!current_layer_descendants.empty()) { + if (current_depth > 5) { + all_descendants.clear(); + return false; + } + current_depth++; + + for (Operation* descendant : current_layer_descendants) { + if (!quant::stablehlo::IsStablehloOp(descendant) || + !ops_to_add.contains(descendant)) { + all_descendants.clear(); + return false; + } + for (Operation* next_descendant : descendant->getUsers()) { + if (reverse_subgraph.contains(next_descendant)) { + continue; + } + next_layer_descendants.push_back(next_descendant); + } + all_descendants.push_back(descendant); + } + + current_layer_descendants = next_layer_descendants; + next_layer_descendants.clear(); + } + + return true; +} + +// Replaces the StableHLO ops in the main function block with +// tf.XlaCallModuleOps as separate subgraphs. Wires them back to the main +// function block to be compatible with SavedModel structure. +void ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOps( + ModuleOp module_op, func::FuncOp main_func, int& stablehlo_func_id) { + Block& main_func_block = main_func.getBody().front(); + + // LiveOuts keeps track of live values at the output of some op. The updates + // must be made in a reverse, bottom-up manner. + const auto result_values = main_func_block.getTerminator()->getOperands(); + LiveOuts liveouts(result_values); + + // Copy ops to iterate because we will be modifying the block during + // iteration. The ordering should be reversed because liveness analysis is a + // bottom-up analysis. The terminator is not included because the return + // statement is not included in any subgraph (e.g. XlaCallModuleOp) and is + // untouched. + SmallVector reverse_main_func_block_ops; + SetVector ops_to_add; + for (Operation& main_func_block_op : + llvm::reverse(main_func_block.without_terminator())) { + reverse_main_func_block_ops.push_back(&main_func_block_op); + ops_to_add.insert(&main_func_block_op); + } + + // Create a separate subgraph invoked with XlaCallModuleOp per each + // set of StableHLO ops in the main func block. + SetVector reverse_subgraph; + SetVector operands; + SetVector defined_values; + + // Add op to the subgraph. + const auto add_to_subgraph = [&](Operation* op) { + // Move on to the parent ops. + liveouts.update(*op); + ops_to_add.remove(op); + + if (!quant::stablehlo::IsStablehloOp(op)) { + // Always update the liveouts when the subgraph isn't being continued. + liveouts.snapshot_previous_state(); + return; + } + + reverse_subgraph.insert(op); + defined_values.insert(op->getResults().begin(), op->getResults().end()); + operands.insert(op->getOperands().begin(), op->getOperands().end()); + }; + + for (Operation* op : reverse_main_func_block_ops) { + if (!ops_to_add.contains(op)) continue; + // When hitting a non-StableHLO op, i.e. tf.CustomAggregatorOp, start + // recursively tracing defining ops of the current subgraph's operands. This + // makes sure that all dependencies needed for shape inference are included + // in the subgraph. We only trace StableHLO ops that have all users inside + // the current subgraph. + // TODO: b/311239049 - Consider rewrite this using BFS. 
+    if (!quant::stablehlo::IsStablehloOp(op)) {
+      bool should_add_op = true;
+      while (should_add_op) {
+        should_add_op = false;
+        SmallVector<Operation*> all_descendants;
+        for (Value v : operands) {
+          if (defined_values.contains(v)) continue;
+          if (ShouldAddOpToSubgraph(v.getDefiningOp(), reverse_subgraph,
+                                    ops_to_add, all_descendants)) {
+            should_add_op = true;
+            break;
+          }
+        }
+        if (should_add_op) {
+          for (auto descendant : llvm::reverse(all_descendants)) {
+            add_to_subgraph(descendant);
+          }
+        }
+      }
+      // Create an XlaCallModuleOp if reverse_subgraph isn't empty.
+      if (!reverse_subgraph.empty()) {
+        UpdateStatesAndReplaceStablehloOps(operands, defined_values, liveouts,
+                                           module_op, reverse_subgraph,
+                                           ++stablehlo_func_id, main_func);
+
+        // Reset states and start a new subgraph.
+        reverse_subgraph.clear();
+        operands.clear();
+        defined_values.clear();
+      }
+    }
+    add_to_subgraph(op);
+  }
+
+  // Create the last subgraph if it isn't empty.
+  if (!reverse_subgraph.empty()) {
+    UpdateStatesAndReplaceStablehloOps(
+        operands, defined_values, liveouts, module_op, reverse_subgraph,
+        ++stablehlo_func_id, main_func, /*is_last_subgraph=*/true);
+  }
+}
+
+// Duplicates small constants for each use.
+//
+// In the subsequent graph partitioning, constants for shape inference need to
+// be in the same subgraph. But graph partitioning stops at ops with multiple
+// uses. So here we duplicate small constants for each use so that if a
+// constant is useful for shape inference in multiple subgraphs, it can be
+// included in each subgraph. If duplicate constants are accidentally created
+// in the same subgraph, they can easily be removed with a canonicalizer pass.
+//
+// We set a size limit since constants needed for shape inference are no
+// larger than tensor rank. This avoids duplicating large constants.
+void DuplicateSmallConstantOps(ModuleOp module_op, func::FuncOp main_func) {
+  OpBuilder builder(main_func.getContext());
+  for (auto constant_op :
+       main_func.getBody().getOps<mlir::stablehlo::ConstantOp>()) {
+    builder.setInsertionPointAfter(constant_op);
+    if (constant_op.getResult().use_empty() ||
+        constant_op.getResult().hasOneUse())
+      continue;
+    // Do not duplicate the constant op if its size is too large.
+    // 32 is chosen to be larger than all constants useful for shape inference,
+    // while small enough not to significantly increase the model size.
+    if (constant_op.getValue().getNumElements() > 32) continue;
+    while (!constant_op.getResult().hasOneUse()) {
+      auto new_constant_op = builder.clone(*constant_op.getOperation());
+      constant_op.getResult().getUses().begin()->assign(
+          dyn_cast<mlir::stablehlo::ConstantOp>(new_constant_op));
+    }
+  }
+}
+
+void ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass::
+    runOnOperation() {
+  ModuleOp module_op = getOperation();
+
+  func::FuncOp main_func = quant::FindMainFuncOp(module_op);
+  if (!main_func) return;
+
+  // In case the model has tf.StatefulPartitionedCallOp or tf.PartitionedCallOp,
+  // we recursively find called functions and process StableHLO ops in them.
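+  // A simple worklist of FuncOps is used below; callees that do not resolve to
+  // a FuncOp in the symbol table are skipped rather than treated as errors.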
+ SmallVector func_ops; + func_ops.push_back(main_func); + int stablehlo_func_id = -1; + while (!func_ops.empty()) { + auto main_func = func_ops.back(); + func_ops.pop_back(); + if (!main_func) continue; + + SymbolTable symbol_table(module_op); + for (auto call_op : main_func.getOps()) { + func_ops.push_back(dyn_cast_or_null(symbol_table.lookup( + mlir::cast(call_op.getFAttr()).getValue()))); + } + for (auto call_op : main_func.getOps()) { + func_ops.push_back( + dyn_cast_or_null(symbol_table.lookup(call_op.getF()))); + } + + DuplicateSmallConstantOps(module_op, main_func); + ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOps(module_op, main_func, + stablehlo_func_id); + } + + // TODO - b/298966126: Currently quantizable functions are identified in TF + // Quantizer via the tf_quant.composite_function UnitAttr attached to + // func ops. We remove this attribute as this interferes with VHLO conversion. + // Remove this temporary hack. + for (auto func_op : module_op.getOps()) { + func_op->removeAttr(kFusedFunctionAttr); + } +} + +} // namespace + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_restore_function_name.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_restore_function_name.cc new file mode 100644 index 000000000000..d047953693e2 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_restore_function_name.cc @@ -0,0 +1,94 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" + +//===----------------------------------------------------------------------===// +// The stablehlo-restore-function-name Pass. +//===----------------------------------------------------------------------===// + +namespace mlir::tf_quant::stablehlo { + +#define GEN_PASS_DEF_RESTOREFUNCTIONNAMEPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc" + +namespace { + +// Restores entry function name from XlaCallModuleOp attribute. 
+// This restoration is required because StableHLO functions are renamed during +// the XlaCallModuleSerialization. +class RestoreFunctionNamePass + : public impl::RestoreFunctionNamePassBase { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RestoreFunctionNamePass) + + explicit RestoreFunctionNamePass() = default; + + void runOnOperation() override; +}; + +void RestoreFunctionNameFromXlaCallModuleOp(TF::XlaCallModuleOp& call_op, + SymbolTable& symbol_table) { + if (!call_op->hasAttr(kOriginalStablehloEntryFunctionAttrName)) { + return; + } + + const auto original_function_name = call_op->getAttrOfType( + kOriginalStablehloEntryFunctionAttrName); + const auto current_function_name = call_op->getAttrOfType( + TF::kStablehloEntryFunctionAttrName); + + if (!original_function_name || !current_function_name) { + return; + } + + auto function = + symbol_table.lookup(current_function_name.getValue()); + if (function) { + function.setName(original_function_name); + } + + call_op->setAttr(TF::kStablehloEntryFunctionAttrName, + FlatSymbolRefAttr::get(original_function_name)); +} + +void RestoreFunctionNamePass::runOnOperation() { + ModuleOp module_op = getOperation(); + + MLIRContext* ctx = module_op.getContext(); + OpBuilder builder(ctx); + SymbolTable symbol_table(module_op); + + // TODO - b/298966126: Improve this logic if needed. + module_op.walk([&](TF::XlaCallModuleOp call_op) { + RestoreFunctionNameFromXlaCallModuleOp(call_op, symbol_table); + }); +} +} // namespace + +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_unfuse_mhlo_batch_norm.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_unfuse_mhlo_batch_norm.cc new file mode 100644 index 000000000000..8a09a010e5c4 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_unfuse_mhlo_batch_norm.cc @@ -0,0 +1,59 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" // IWYU pragma: keep +#include "xla/mlir_hlo/mhlo/transforms/rewriters.h" + +//===----------------------------------------------------------------------===// +// The unfuse-mhlo-batch-norm Pass. 
+//===----------------------------------------------------------------------===//
+
+namespace mlir::tf_quant::stablehlo {
+
+#define GEN_PASS_DEF_UNFUSEMHLOBATCHNORMPASS
+#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc"
+
+namespace {
+
+class UnfuseMhloBatchNormPass
+    : public impl::UnfuseMhloBatchNormPassBase<UnfuseMhloBatchNormPass> {
+ public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(UnfuseMhloBatchNormPass)
+
+  explicit UnfuseMhloBatchNormPass() = default;
+
+ private:
+  void runOnOperation() override;
+};
+
+void UnfuseMhloBatchNormPass::runOnOperation() {
+  MLIRContext* ctx = &getContext();
+  RewritePatternSet patterns(ctx);
+  mhlo::populateUnfuseBatchNormPatterns(ctx, &patterns);
+
+  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+    return signalPassFailure();
+  }
+}
+}  // namespace
+
+}  // namespace mlir::tf_quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_unwrap_xla_call_module_op.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_unwrap_xla_call_module_op.cc
new file mode 100644
index 000000000000..2b80378bb8fd
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_unwrap_xla_call_module_op.cc
@@ -0,0 +1,132 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Casting.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/OpDefinition.h"  // from @llvm-project
+#include "mlir/IR/Region.h"  // from @llvm-project
+#include "mlir/IR/SymbolTable.h"  // from @llvm-project
+#include "mlir/IR/Value.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project  // IWYU pragma: keep
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/TypeID.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h"
+#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
+
+namespace mlir::tf_quant::stablehlo {
+
+#define GEN_PASS_DEF_UNWRAPXLACALLMODULEOPPASS
+#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc"
+
+namespace {
+
+// Unwraps XlaCallModule ops without the quantizable trait that call functions
+// with the '_from_xla_call_module' trait.
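+// A minimal sketch of the effect (hypothetical IR; @some_fn is illustrative):
+//
+//   %0 = "tf.XlaCallModule"(%arg0) {_entry_function = @some_fn, ...}
+//
+// where @some_fn carries the `_from_xla_call_module` attribute is replaced by
+// a clone of @some_fn's body at the call site: the call operands are mapped
+// onto the function arguments, each call result is replaced by the
+// corresponding cloned return value, and the XlaCallModuleOp is erased.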
+class UnwrapXlaCallModuleOpPass
+    : public impl::UnwrapXlaCallModuleOpPassBase<UnwrapXlaCallModuleOpPass> {
+ public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(UnwrapXlaCallModuleOpPass)
+
+  explicit UnwrapXlaCallModuleOpPass() = default;
+
+ private:
+  void runOnOperation() override;
+};
+
+void UnwrapXlaCallModuleOp(TF::XlaCallModuleOp call_op,
+                           SymbolTable& symbol_table) {
+  // Do not inline lifted quantized functions used for fusing patterns.
+  // TODO - b/310539922: Remove reference to TF/TFL utils.
+  if (call_op->hasAttr(kQuantTraitAttrName)) {
+    return;
+  }
+
+  auto function_name = call_op
+                           ->getAttrOfType<FlatSymbolRefAttr>(
+                               TF::kStablehloEntryFunctionAttrName)
+                           .getValue();
+  func::FuncOp func_op = symbol_table.lookup<func::FuncOp>(function_name);
+
+  // We should not unwrap if the function is not from
+  // ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass.
+  if (!func_op->hasAttr(TF::kFromXlaCallModuleAttrName)) {
+    return;
+  }
+
+  MLIRContext* context = call_op.getContext();
+  OpBuilder builder(context);
+  builder.setInsertionPointAfter(call_op);
+
+  IRMapping arg_mapper;
+  bool call_op_has_platform_index_arg = call_op.getPlatforms().size() > 1;
+  // Add an argument for platform_index. This allows for multiple platforms.
+  // TODO: b/310291615 - find a better way for multi-platform support.
+  if (call_op_has_platform_index_arg) {
+    arg_mapper.map(func_op.getArgument(0),
+                   builder.create<mhlo::ConstantOp>(
+                       func_op.getLoc(), builder.getI16IntegerAttr(0)));
+  }
+  for (auto [func_arg, operand] : llvm::zip_equal(
+           func_op.getArguments().take_back(call_op.getNumOperands()),
+           call_op.getOperands())) {
+    arg_mapper.map(func_arg, operand);
+  }
+
+  Region& function_body = func_op.getBody();
+  IRMapping new_op_mapper;
+  for (Operation& op : function_body.getOps()) {
+    if (llvm::isa<func::ReturnOp>(op)) {
+      for (auto [call_result, return_value] :
+           llvm::zip_equal(call_op.getResults(), op.getOperands())) {
+        Value new_result = new_op_mapper.lookup(return_value);
+
+        call_result.replaceAllUsesWith(new_result);
+      }
+      continue;
+    }
+
+    Operation& new_op = *builder.clone(op, arg_mapper);
+    for (auto [result, new_result] :
+         llvm::zip_equal(op.getResults(), new_op.getResults())) {
+      new_op_mapper.map(result, new_result);
+    }
+  }
+
+  call_op.erase();
+}
+
+void UnwrapXlaCallModuleOpPass::runOnOperation() {
+  ModuleOp module_op = getOperation();
+  SymbolTable symbol_table(module_op);
+
+  for (auto func_op : module_op.getOps<func::FuncOp>()) {
+    Region& function_body = func_op.getBody();
+
+    function_body.walk([&](TF::XlaCallModuleOp call_op) {
+      UnwrapXlaCallModuleOp(call_op, symbol_table);
+    });
+  }
+}
+
+}  // namespace
+
+}  // namespace mlir::tf_quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_xla_call_module_to_call.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_xla_call_module_to_call.cc
new file mode 100644
index 000000000000..250123ad9190
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_xla_call_module_to_call.cc
@@ -0,0 +1,84 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/OpDefinition.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/IR/SymbolTable.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project  // IWYU pragma: keep
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Support/TypeID.h"  // from @llvm-project
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+
+namespace mlir::tf_quant::stablehlo {
+
+#define GEN_PASS_DEF_XLACALLMODULETOCALLPASS
+#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h.inc"
+
+namespace {
+
+// Converts XlaCallModuleOps to func.call.
+class XlaCallModuleToCallPass
+    : public impl::XlaCallModuleToCallPassBase<XlaCallModuleToCallPass> {
+ public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(XlaCallModuleToCallPass)
+
+  explicit XlaCallModuleToCallPass() = default;
+
+ private:
+  void runOnOperation() override;
+};
+
+// Converts XlaCallModuleOps to func.call.
+class XlaCallModuleOpToCallOp : public OpRewritePattern<TF::XlaCallModuleOp> {
+ public:
+  explicit XlaCallModuleOpToCallOp(MLIRContext* context)
+      : OpRewritePattern<TF::XlaCallModuleOp>(context) {}
+
+  LogicalResult matchAndRewrite(TF::XlaCallModuleOp op,
+                                PatternRewriter& rewriter) const override {
+    auto module_op = op->getParentOfType<ModuleOp>();
+    SymbolTable symbol_table(module_op);
+
+    auto entry_func_op = dyn_cast_or_null<func::FuncOp>(
+        symbol_table.lookup(GetEntryFunctionName(op)));
+    if (!entry_func_op) return failure();
+
+    // Replace the XlaCallModuleOp with a new CallOp.
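+    // A minimal sketch of the rewrite (hypothetical IR; @composite_fn is an
+    // illustrative entry function name):
+    //   %0 = "tf.XlaCallModule"(%arg0, %arg1) {_entry_function = @composite_fn, ...}
+    // becomes
+    //   %0 = func.call @composite_fn(%arg0, %arg1)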
+ rewriter.replaceOpWithNewOp(op, entry_func_op, op.getArgs()); + return success(); + } +}; + +void XlaCallModuleToCallPass::runOnOperation() { + ModuleOp module_op = getOperation(); + MLIRContext* ctx = module_op.getContext(); + RewritePatternSet patterns(&getContext()); + patterns.add(ctx); + if (failed(applyPatternsGreedily(module_op, std::move(patterns)))) { + signalPassFailure(); + } +} + +} // namespace +} // namespace mlir::tf_quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/tf_prepare_quantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/tf_prepare_quantize.mlir new file mode 100644 index 000000000000..69f509653328 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/tf_prepare_quantize.mlir @@ -0,0 +1,140 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-prepare-quantize=enable-per-channel-quantized-weight=false -verify-diagnostics | FileCheck %s + +// ----- + +// CHECK-LABEL: func @dot +// CHECK-SAME: (%[[ARG_0:.*]]: tensor) -> tensor +func.func @dot(%arg0: tensor) -> tensor { + // CHECK: %[[cst:.*]] = stablehlo.constant + // CHECK: %[[q1:.*]] = "quantization.qcast"(%[[cst]]) + // CHECK-SAME: quant.uniform + // CHECK: %[[dq1:.*]] = "quantization.dcast"(%[[q1]]) + // CHECK-SAME: quant.uniform + %cst = stablehlo.constant dense<[[-0.960978984, -0.390246302], [-0.790828585, -0.601039409], [-1.0280807, -1.02731466]]> : tensor<3x2xf32> + // CHECK: %[[q2:.*]] = "quantization.qcast"(%[[ARG_0]]) + // CHECK-SAME: quant.uniform + // CHECK: %[[dq2:.*]] = "quantization.dcast"(%[[q2]]) + // CHECK-SAME: quant.uniform + %0 = "quantization.stats"(%arg0) {bitsNum = 8 : i64, layerStats = dense<[-0.999415695, 0.99998933]> : tensor<2xf32>, narrowRange = false} : (tensor) -> tensor + // CHECK: %[[dot:.*]] = stablehlo.dot %[[dq2]], %[[dq1]] + %1 = stablehlo.dot %0, %cst : (tensor, tensor<3x2xf32>) -> tensor + // CHECK: %[[q3:.*]] = "quantization.qcast"(%[[dot]]) + // CHECK-SAME: quant.uniform> + // CHECK: %[[dq3:.*]] = "quantization.dcast"(%[[q3]]) + // CHECK-SAME: quant.uniform> + %2 = "quantization.stats"(%1) {bitsNum = 8 : i64, layerStats = dense<[-3.6289506, 5.61605835]> : tensor<2xf32>, narrowRange = false} : (tensor) -> tensor + // CHECK: return %[[dq3]] + func.return %2 : tensor +} + +// ----- + +// CHECK-LABEL: func @duplicate_stats +// CHECK-SAME: (%[[ARG_0:.*]]: tensor<2x3xf32>) -> tensor<2x3xf32> +func.func @duplicate_stats(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { + // CHECK: %[[q1:.*]] = "quantization.qcast"(%[[ARG_0]]) + // CHECK: %[[dq1:.*]] = "quantization.dcast"(%[[q1]]) + // CHECK: %[[q2:.*]] = "quantization.qcast"(%[[dq1]]) + // CHECK: %[[dq2:.*]] = "quantization.dcast"(%[[q2]]) + // CHECK: stablehlo.convert %[[dq2]] + %0 = "quantization.stats"(%arg0) {bitsNum = 8 : i64, layerStats = dense<[-0.999415695, 0.99998933]> : tensor<2xf32>, narrowRange = false} : (tensor<2x3xf32>) -> tensor<2x3xf32> + %1 = "quantization.stats"(%0) {bitsNum = 8 : i64, layerStats = dense<[-2.0, 2.0]> : tensor<2xf32>, narrowRange = false} : (tensor<2x3xf32>) -> tensor<2x3xf32> + %2 = stablehlo.convert %1 : (tensor<2x3xf32>) -> (tensor<2x3xf32>) + func.return %2 : tensor<2x3xf32> +} + +// ----- + +// CHECK-LABEL: func @dot_redundant_stats +// CHECK-SAME: (%[[ARG_0:.*]]: tensor) -> tensor +func.func @dot_redundant_stats(%arg0: tensor) -> tensor { + // CHECK: %[[cst:.*]] = stablehlo.constant + // CHECK: %[[q1:.*]] = "quantization.qcast"(%[[cst]]) + // 
CHECK-SAME: quant.uniform + // CHECK: %[[dq1:.*]] = "quantization.dcast"(%[[q1]]) + // CHECK-SAME: quant.uniform + %cst = stablehlo.constant dense<[[-0.960978984, -0.390246302], [-0.790828585, -0.601039409], [-1.0280807, -1.02731466]]> : tensor<3x2xf32> + // CHECK: %[[q2:.*]] = "quantization.qcast"(%[[ARG_0]]) + // CHECK-SAME: quant.uniform + // CHECK: %[[dq2:.*]] = "quantization.dcast"(%[[q2]]) + // CHECK-SAME: quant.uniform + %0 = "quantization.stats"(%arg0) {bitsNum = 8 : i64, layerStats = dense<[-100.2, 212.4]> : tensor<2xf32>, narrowRange = false} : (tensor) -> tensor + %1 = "quantization.qcast"(%0) {volatile} : (tensor) -> tensor> + %2 = "quantization.dcast"(%1) : (tensor>) -> tensor + // CHECK: %[[dot:.*]] = stablehlo.dot %[[dq2]], %[[dq1]] + %3 = stablehlo.dot %2, %cst : (tensor, tensor<3x2xf32>) -> tensor + // CHECK: %[[q3:.*]] = "quantization.qcast"(%[[dot]]) + // CHECK-SAME: quant.uniform> + // CHECK: %[[dq3:.*]] = "quantization.dcast"(%[[q3]]) + // CHECK-SAME: quant.uniform> + %4 = "quantization.stats"(%3) {bitsNum = 8 : i64, layerStats = dense<[-3.6289506, 5.61605835]> : tensor<2xf32>, narrowRange = false} : (tensor) -> tensor + // CHECK: return %[[dq3]] + func.return %4 : tensor +} + +// ----- + +// CHECK-LABEL: func @reshape_same_scale_propagate +func.func @reshape_same_scale_propagate(%arg0: tensor<2x3xf32>) -> tensor<6xf32> { + // CHECK: %[[dq:.*]] = "quantization.dcast" + // CHECK-SAME: (tensor<2x3x!quant.uniform>) + %0 = "quantization.stats"(%arg0) {bitsNum = 8 : i64, layerStats = dense<[-0.999415695, 0.99998933]> : tensor<2xf32>, narrowRange = false} : (tensor<2x3xf32>) -> tensor<2x3xf32> + // CHECK: %[[reshape:.*]] = stablehlo.reshape %[[dq]] + %1 = stablehlo.reshape %0 : (tensor<2x3xf32>) -> (tensor<6xf32>) + // CHECK: %[[q:.*]] = "quantization.qcast"(%[[reshape]]) + // CHECK-SAME: -> tensor<6x!quant.uniform> + %2 = "quantization.stats"(%1) {bitsNum = 8 : i64, layerStats = dense<[-2.0, 2.0]> : tensor<2xf32>, narrowRange = false} : (tensor<6xf32>) -> tensor<6xf32> + func.return %2 : tensor<6xf32> +} + +// ----- + +// CHECK-LABEL: func @merge_consecutive_qcast +// CHECK-SAME: (%[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor, %[[ARG_2:.*]]: tensor) -> (tensor, tensor) +func.func @merge_consecutive_qcast(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor, tensor) { + // CHECK: "quantization.qcast"(%[[ARG_1]]) + // CHECK-SAME: -> tensor> + // CHECK: "quantization.qcast"(%[[ARG_1]]) + // CHECK-SAME: -> tensor> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[-0.83811146, 2.4960899]> : tensor<2xf32>} : (tensor) -> tensor + %1 = "quantization.stats"(%arg1) {layerStats = dense<[-0.835039615, 1.000000e+00]> : tensor<2xf32>} : (tensor) -> tensor + %2 = "stablehlo.concatenate"(%0, %1) {dimension = 0 : i64} : (tensor, tensor) -> tensor + %3 = "quantization.stats"(%2) {layerStats = dense<[-0.83811146, 2.4960899]> : tensor<2xf32>} : (tensor) -> tensor + %4 = "quantization.stats"(%arg2) {layerStats = dense<[-1.5726943, 1.07351148]> : tensor<2xf32>} : (tensor) -> tensor + %5 = "stablehlo.concatenate"(%4, %1) {dimension = 0 : i64} : (tensor, tensor) -> tensor + %6 = "quantization.stats"(%5) {layerStats = dense<[-1.5726943, 4.6875381]> : tensor<2xf32>} : (tensor) -> tensor + func.return %3, %6 : tensor, tensor +} + +// ----- + +// CHECK-LABEL: func @skip_nan_inf_constant +// CHECK-SAME: (%[[ARG_0:.*]]: tensor) -> tensor +func.func @skip_nan_inf_constant(%arg0: tensor) -> tensor { + // CHECK-DAG: %[[cst0:.*]] = stablehlo.constant dense<0xFF800000> : tensor : tensor + // 
CHECK-DAG: %[[cst2:.*]] = stablehlo.constant dense<6.000000e+00> : tensor + // CHECK-DAG: %[[cst3:.*]] = stablehlo.constant dense<0.000000e+00> : tensor + // CHECK-NOT: %[[q0:.*]] = "quantization.qcast"(%[[cst0]]) + // CHECK-NOT: %[[q1:.*]] = "quantization.qcast"(%[[cst1]]) + // CHECK: %[[q2:.*]] = "quantization.qcast"(%[[cst2]]) + // CHECK-SAME: quant.uniform + // CHECK: %[[dq2:.*]] = "quantization.dcast"(%[[q2]]) + // CHECK-SAME: quant.uniform + // CHECK: %[[q3:.*]] = "quantization.qcast"(%[[cst3]]) + // CHECK-SAME: quant.uniform + // CHECK: %[[dq3:.*]] = "quantization.dcast"(%[[q3]]) + // CHECK-SAME: quant.uniform + %0 = stablehlo.constant dense<0xFF800000> : tensor + %1 = stablehlo.constant dense<0x7FC00000> : tensor + %2 = stablehlo.constant dense<6.000000e+00> : tensor + %3 = stablehlo.constant dense<0.000000e+00> : tensor + %4 = "stablehlo.add"(%0, %1) : (tensor, tensor) -> tensor + %5 = stablehlo.clamp %3, %arg0, %2 : (tensor, tensor, tensor) -> tensor + %6 = "stablehlo.reduce_window"(%5, %4) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = stablehlo.maximum %arg1, %arg2 : tensor + stablehlo.return %7 : tensor + }) {padding = dense<[[0, 0], [0, 1], [0, 1], [0, 0]]> : tensor<4x2xi64>, window_dimensions = array, window_strides = array} : (tensor, tensor) -> tensor + return %6 : tensor +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/tf_prepare_quantize_int4.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/tf_prepare_quantize_int4.mlir new file mode 100644 index 000000000000..81a95f9066bc --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/tf_prepare_quantize_int4.mlir @@ -0,0 +1,26 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-prepare-quantize=bit-width=4 -verify-diagnostics | FileCheck %s + +// CHECK-LABEL: func @dot_int4 +// CHECK-SAME: (%[[ARG_0:.*]]: tensor) -> tensor +func.func @dot_int4(%arg0: tensor) -> tensor { + // CHECK: %[[cst:.*]] = stablehlo.constant + // CHECK: %[[q1:.*]] = "quantization.qcast"(%[[cst]]) + // CHECK-SAME: quant.uniform + // CHECK: %[[dq1:.*]] = "quantization.dcast"(%[[q1]]) + // CHECK-SAME: quant.uniform + %cst = stablehlo.constant dense<[[-0.960978984, -0.390246302], [-0.790828585, -0.601039409], [-1.0280807, -1.02731466]]> : tensor<3x2xf32> + // CHECK: %[[q2:.*]] = "quantization.qcast"(%[[ARG_0]]) + // CHECK-SAME: quant.uniform + // CHECK: %[[dq2:.*]] = "quantization.dcast"(%[[q2]]) + // CHECK-SAME: quant.uniform + %0 = "quantization.stats"(%arg0) {bitsNum = 8 : i64, layerStats = dense<[-0.999415695, 0.99998933]> : tensor<2xf32>, narrowRange = false} : (tensor) -> tensor + // CHECK: %[[dot:.*]] = stablehlo.dot %[[dq2]], %[[dq1]] + %1 = stablehlo.dot %0, %cst : (tensor, tensor<3x2xf32>) -> tensor + // CHECK: %[[q3:.*]] = "quantization.qcast"(%[[dot]]) + // CHECK-SAME: quant.uniform> + // CHECK: %[[dq3:.*]] = "quantization.dcast"(%[[q3]]) + // CHECK-SAME: quant.uniform> + %2 = "quantization.stats"(%1) {bitsNum = 8 : i64, layerStats = dense<[-3.6289506, 5.61605835]> : tensor<2xf32>, narrowRange = false} : (tensor) -> tensor + // CHECK: return %[[dq3]] + func.return %2 : tensor +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/tf_prepare_quantize_per_channel.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/tf_prepare_quantize_per_channel.mlir new file mode 100644 index 000000000000..196c517d3f46 --- /dev/null +++ 
b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/tf_prepare_quantize_per_channel.mlir @@ -0,0 +1,130 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-prepare-quantize=enable-per-channel-quantized-weight=true -verify-diagnostics | FileCheck %s + +// ----- + +module { + // CHECK-LABEL: conv_with_bias_and_relu + func.func private @conv_with_bias_and_relu(%arg0: tensor<1x3x2x3xf32>) -> tensor<1x2x2x2xf32> { + %cst = "tf.Const"() {device = "", value = dense<[7.11401462, 7.05456924]> : tensor<2xf32>} : () -> tensor<2xf32> + // CHECK: %[[q_weight_per_channel:.*]] = "quantization.qcast" + // CHECK-SAME: -> tensor<2x3x3x2x!quant.uniform:f32:3, {0.075123051020104109,0.072960192762960605}>> + // CHECK: %[[dq_weight:.*]] = "quantization.dcast"(%[[q_weight_per_channel]]) + %cst_0 = "tf.Const"() {device = "", value = dense<[[[[-6.30731344, 5.4962182], [1.80364347, -7.64542675], [-2.11145878, -7.08605719]], [[-9.54062747, -6.14013147], [6.12640238, -4.18223286], [5.05738974, 8.99269962]], [[3.3535192, 0.84816426], [-6.64676809, -7.95477629], [5.81315517, 9.21566581]]], [[[1.38622558, 4.63866329], [4.54742622, -1.43770897], [-3.96835279, 2.99996852]], [[0.989735424, -4.83384752], [-7.27702999, 1.17216611], [1.33735656, 0.728900194]], [[5.1286211, 8.98645591], [1.55008793, -3.85491467], [3.7003777, 9.26594448]]]]> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + // CHECK: %[[q_act:.*]] = "quantization.qcast"(%arg0) + // CHECK-SAME: -> tensor<1x3x2x3x!quant.uniform> + // CHECK: %[[dq_act:.*]] = "quantization.dcast"(%[[q_act]]) + %0 = "quantization.stats"(%arg0) {layerStats = dense<[1.27501142, 4.824783]> : tensor<2xf32>} : (tensor<1x3x2x3xf32>) -> tensor<1x3x2x3xf32> + // CHECK: "tf.XlaCallModule"(%[[dq_act]], %[[dq_weight]] + %1 = "tf.XlaCallModule"(%0, %cst_0, %cst) { + Sout = [#tf_type.shape<1x2x2x2>], config = "", + module = "composite_conv2d_with_bias_and_relu6_fn_10", + _entry_function = @composite_conv2d_with_bias_and_relu6_fn_10, + // Represents a per-channel quantization for the operand index 1 with + // quantization dimension of 3 + _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}", + platforms = [], version = 4 : i64 + } : (tensor<1x3x2x3xf32>, tensor<2x3x3x2xf32>, tensor<2xf32>) -> tensor<1x2x2x2xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[0.000000e+00, 6.000000e+00]> : tensor<2xf32>} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> + return %2 : tensor<1x2x2x2xf32> + } + + // CHECK-LABEL: composite_conv2d_with_bias_and_relu6_fn_10 + func.func private @composite_conv2d_with_bias_and_relu6_fn_10(%arg0: tensor<1x3x2x3xf32>, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<2xf32>) -> tensor<1x2x2x2xf32> attributes {tf.tf_quant.composite_function} { + %0 = "quantization.stats"(%arg1) {layerStats = dense<[-3.54062747, 0.54742622]> : tensor<2xf32>} : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2xf32> + %1 = "quantization.stats"(%arg0) {layerStats = dense<[1.27501142, 2.824783]> : tensor<2xf32>} : (tensor<1x3x2x3xf32>) -> tensor<1x3x2x3xf32> + %2 = stablehlo.convolution(%1, %0) + dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], + window = { + stride = [1, 1], pad = [[0, 0], [1, 1]], + lhs_dilate = [1, 1], + rhs_dilate = [1, 1] + } + { + batch_group_count = 1 : i64, + feature_group_count = 1 : i64 + } : (tensor<1x3x2x3xf32>, tensor<2x3x3x2xf32>) + -> tensor<1x2x2x2xf32> + %3 = "quantization.stats"(%arg2) {layerStats = dense<[7.05456924, 7.11401462]> : tensor<2xf32>} : 
(tensor<2xf32>) -> tensor<2xf32> + %4 = "quantization.stats"(%2) {layerStats = dense<[-1.36523, 3.57373]> : tensor<2xf32>} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> + %5 = "chlo.broadcast_add"(%4, %3) : (tensor<1x2x2x2xf32>, tensor<2xf32>) -> tensor<1x2x2x2xf32> + %6 = "quantization.stats"(%5) {layerStats = dense<[-1.31055, 2.62842]> : tensor<2xf32>} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> + %cst_min = stablehlo.constant dense<0.0> : tensor + %cst_max = stablehlo.constant dense<6.0> : tensor + %7 = "stablehlo.clamp"(%cst_min, %6, %cst_max) {device = ""} : (tensor, tensor<1x2x2x2xf32>, tensor) -> tensor<1x2x2x2xf32> + %8 = "quantization.stats"(%7) {layerStats = dense<[0.000000e+00, 6.000000e+00]> : tensor<2xf32>} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> + return %8 : tensor<1x2x2x2xf32> + } +} + +// ----- + +module { + // CHECK-LABEL: dot_general + func.func private @dot_general(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + // CHECK: %[[q_weight:.*]] = "quantization.qcast" + // CHECK-SAME: -> tensor<2x2x!quant.uniform:f32:1, {0.049663885371891529,0.060200210631363035}>> + // CHECK: %[[dq_weight:.*]] = "quantization.dcast"(%[[q_weight]]) + %cst = "tf.Const"() {device = "", value = dense<[[-6.30731344, 5.4962182], [1.80364347, -7.64542675]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32> + // CHECK: %[[q_act:.*]] = "quantization.qcast"(%arg0) + // CHECK-SAME: -> tensor<2x2x!quant.uniform> + // CHECK: %[[dq_act:.*]] = "quantization.dcast"(%[[q_act]]) + %0 = "quantization.stats"(%arg0) {layerStats = dense<[1.27501142, 4.824783]> : tensor<2xf32>} : (tensor<2x2xf32>) -> tensor<2x2xf32> + // CHECK: "tf.XlaCallModule"(%[[dq_act]], %[[dq_weight]] + %1 = "tf.XlaCallModule"(%0, %cst) { + Sout = [#tf_type.shape<2x2>], config = "", + _entry_function = @composite_dot_general, + module = "composite_dot_general", + platforms = [], version = 4 : i64 + } : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[0.000000e+00, 6.000000e+00]> : tensor<2xf32>} : (tensor<2x2xf32>) -> tensor<2x2xf32> + return %2 : tensor<2x2xf32> + } + + // CHECK-LABEL: composite_dot_general + func.func private @composite_dot_general(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { + %0 = "stablehlo.dot_general"(%arg0, %arg1) { + dot_dimension_numbers = #stablehlo.dot< + lhs_batching_dimensions = [], + rhs_batching_dimensions = [], + lhs_contracting_dimensions = [1], + rhs_contracting_dimensions = [0] + > + } : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + } +} + +// ----- + +// Tests that the `PrepareQuantizePass` prepares for per-tensor quantization for +// the weight of convolution. This is based on the `_quantization_method` that +// does not have a `input_quantized_types` with a specified `dimension_specs`. 
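+// (Illustrative contrast, not checked by this test: if the method instead
+// carried `input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}`,
+// the weight would receive a per-channel type along dimension 3, e.g.
+// !quant.uniform<i8<-127:127>:f32:3, {s0,s1}> with one scale per channel, as
+// in the per-channel test above.)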
+ +// CHECK-LABEL: conv_per_tensor_quantized_method +func.func private @conv_per_tensor_quantized_method(%arg0: tensor<1x3x2x3xf32>) -> tensor<1x2x2x2xf32> { + %cst = "tf.Const"() {device = "", value = dense<[7.11401462, 7.05456924]> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_0 = "tf.Const"() {device = "", value = dense<[[[[-6.30731344, 5.4962182], [1.80364347, -7.64542675], [-2.11145878, -7.08605719]], [[-9.54062747, -6.14013147], [6.12640238, -4.18223286], [5.05738974, 8.99269962]], [[3.3535192, 0.84816426], [-6.64676809, -7.95477629], [5.81315517, 9.21566581]]], [[[1.38622558, 4.63866329], [4.54742622, -1.43770897], [-3.96835279, 2.99996852]], [[0.989735424, -4.83384752], [-7.27702999, 1.17216611], [1.33735656, 0.728900194]], [[5.1286211, 8.98645591], [1.55008793, -3.85491467], [3.7003777, 9.26594448]]]]> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[1.27501142, 4.824783]> : tensor<2xf32>} : (tensor<1x3x2x3xf32>) -> tensor<1x3x2x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst_0, %cst) { + Sout = [#tf_type.shape<1x2x2x2>], config = "", + module = "composite_conv_fn_1", + _entry_function = @composite_conv_fn_1, + _quantization_method = "static_range_ptq {}", + platforms = [], version = 4 : i64 + } : (tensor<1x3x2x3xf32>, tensor<2x3x3x2xf32>, tensor<2xf32>) -> tensor<1x2x2x2xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[0.000000e+00, 6.000000e+00]> : tensor<2xf32>} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> + return %2 : tensor<1x2x2x2xf32> +} +// CHECK-SAME: %[[ARG_0:.+]]: tensor<1x3x2x3xf32> + +// Test that the weight is prepared for per-tensor quantization, based on the +// `_quantization_method` attribute without a `dimension_specs` field in +// `QuantizedType`. +// CHECK-DAG: %[[WEIGHT_CONST:.+]] = stablehlo.constant {{.*}} tensor<2x3x3x2xf32> +// CHECK: %[[Q_WEIGHT_PER_TENSOR:.*]] = "quantization.qcast"(%[[WEIGHT_CONST]]) {{.*}} (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform> +// CHECK: %[[DQ_WEIGHT:.*]] = "quantization.dcast"(%[[Q_WEIGHT_PER_TENSOR]]) + +// CHECK: %[[Q_ACTIVATION:.*]] = "quantization.qcast"(%[[ARG_0]]) +// CHECK-SAME: -> tensor<1x3x2x3x!quant.uniform> +// CHECK: %[[DQ_ACTIVATION:.*]] = "quantization.dcast"(%[[Q_ACTIVATION]]) +// CHECK: "tf.XlaCallModule"(%[[DQ_ACTIVATION]], %[[DQ_WEIGHT]] diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/tf_quantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/tf_quantize.mlir new file mode 100644 index 000000000000..17e38625a42e --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/tf_quantize.mlir @@ -0,0 +1,74 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-quantize -verify-each=false | FileCheck %s + +// Tests for PopulateFusedGemmStylePatterns are handled in +// quantize_composite_functions for module-level evaluation of functions. 
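+// Sketch of the expected transformation (assuming the IR below): the
+// quantization.qcast/dcast pairs produced by the prepare step are folded away,
+// the quantizable tf.XlaCallModule op is replaced by a func.call to a
+// `quantized_*` function whose operands and results carry !quant.uniform
+// types, and a final quantization.dcast converts the result back to float.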
+ +module attributes {tf_saved_model.semantics} { +// CHECK: quantize_simple_xla_call_module(%[[ARG_0:.+]]: tensor<1x4xf32>) + func.func private @quantize_simple_xla_call_module(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant dense<1.000000e+00> : tensor<4x3xf32> + %1 = "quantization.qcast"(%0) {volatile} : (tensor<4x3xf32>) -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03, 5.000000e-03, 5.000000e-03}>> + %2 = "quantization.dcast"(%1) : (tensor<4x3x!quant.uniform:f32:1, {5.000000e-03, 5.000000e-03, 5.000000e-03}>>) -> tensor<4x3xf32> + %3 = "quantization.qcast"(%arg0) {volatile} : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + %4 = "quantization.dcast"(%3) : (tensor<1x4x!quant.uniform>) -> tensor<1x4xf32> + %5 = "tf.XlaCallModule"(%4, %2) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %6 = "quantization.qcast"(%5) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %7 = "quantization.dcast"(%6) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %7 : tensor<1x3xf32> + } +// Test that the inputs and output of the tf.XlaCallModule op has been replaced +// by quantized types, and the corresponding quantization.dcast ops that turned +// those quantized types back to float types are removed. +// CHECK: %[[CONST_0:.+]] = stablehlo.constant dense<1.000000e+00> : tensor<4x3xf32> +// CHECK-DAG: %[[QCAST_0:.+]] = "quantization.qcast"(%[[CONST_0]]) {volatile} : (tensor<4x3xf32>) -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> +// CHECK-DAG: %[[QCAST_1:.+]] = "quantization.qcast"(%[[ARG_0]]) {volatile} : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[QCAST_1]], %[[QCAST_0]]) +// Test that the `Method` has been copied over. +// CHECK-SAME: {_quantization_method = "static_range_ptq { }"} +// CHECK-SAME: : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[DCAST_0:.+]] = "quantization.dcast"(%[[CALL_0]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> +// CHECK: return + + func.func private @composite_dot_general_fn(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} + +// ----- + +// Tests that the output of the tf.XlaCallModule op has been replaced by +// a quantized type, and the corresponding quantization.qcast ops that turned +// the float output to a quantized type is removed. 
+ +// CHECK-LABEL: quantize_simple_xla_call_module_no_operand +func.func private @quantize_simple_xla_call_module_no_operand() -> tensor<1x3xf32> { + %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : () -> tensor<1x3xf32> + %1 = "quantization.qcast"(%0) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %2 = "quantization.dcast"(%1) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> +} +// CHECK: %[[XLA_CALL_MODULE_0:.+]] = "tf.XlaCallModule"() <{{{.*}}}> {{{.*}}} : () -> tensor<1x3x!quant.uniform> +// CHECK: %[[DCAST_0:.+]] = "quantization.dcast"(%[[XLA_CALL_MODULE_0]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> +// CHECK: "func.return"(%[[DCAST_0]]) : (tensor<1x3xf32>) -> () + +// ----- + +// Tests for emitting an error when there is no corresponding entry +// function to quantize (@composite_dot_general_fn). + +module attributes {tf_saved_model.semantics} { + func.func private @error_when_no_entry_function(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %0 = stablehlo.constant dense<1.000000e+00> : tensor<2x3xf32> + %1 = "quantization.qcast"(%0) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32, 5.000000e-03>> + %2 = "quantization.dcast"(%1) : (tensor<2x3x!quant.uniform:f32, 5.000000e-03>>) -> tensor<2x3xf32> + %3 = "quantization.qcast"(%arg0) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %4 = "quantization.dcast"(%3) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> +// expected-error @+2 {{Failed to find a valid entry function}} +// expected-error @+1 {{'tf.XlaCallModule' op operand #0 must be variadic of tensor of tf.dtype values}} + %5 = "tf.XlaCallModule"(%4, %2) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %6 = "quantization.qcast"(%5) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %7 = "quantization.dcast"(%6) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %7 : tensor<1x3xf32> + } +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/tf_quantize_op_with_region.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/tf_quantize_op_with_region.mlir new file mode 100644 index 000000000000..5edfea7bc490 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/tf_quantize_op_with_region.mlir @@ -0,0 +1,241 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-quantize -verify-each=false | FileCheck %s + +// Tests if reduce_window op following quantized function is quantized. 
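+// Sketch of the expected behavior (assuming the IR below): reduce_window is
+// handled as a same-scale op, so once the preceding quantized call produces a
+// !quant.uniform result, the reduce_window operands, its region block
+// arguments, and its result are all rewritten to that same quantized element
+// type instead of f32.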
+ +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1722 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: main_00 + // CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x1x1024xf32> + func.func private @main_00(%arg0: tensor<2x3x1x1024xf32>) -> tensor<2x3x1x3xf32> attributes {tf._original_func_name = "main_0"} { + // CHECK: %[[CST0:.*]] = stablehlo.constant dense<0xFF800000> : tensor + // CHECK: %[[CST1:.*]] = stablehlo.constant dense<0xFF80000E> : tensor<2x3x1024x3xf32> + // CHECK: %[[Q0:.*]] = "quantization.qcast"(%[[CST0]]) + // CHECK: %[[Q1:.*]] = "quantization.qcast"(%[[CST1]]) + // CHECK: %[[Q2:.*]] = "quantization.qcast"(%[[ARG0]]) + // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[Q2]], %[[Q1]]) + + // CHECK: %[[REDUCE:.*]] = "stablehlo.reduce_window"(%[[CALL]], %[[Q0]]) + // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64> + // CHECK-SAME: window_dimensions = array + // CHECK: %[[ARG1:.*]]: tensor>, %[[ARG2:.*]]: tensor> + // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ARG1]], %[[ARG2]] : tensor> + // CHECK: stablehlo.return %[[MAX]] : tensor> + // CHECK: (tensor<2x3x1x3x!quant.uniform>, tensor>) -> tensor<2x3x1x3x!quant.uniform> + + // CHECK: %[[DQ:.*]] = "quantization.dcast"(%[[REDUCE]]) + // CHECK: return %[[DQ]] + + %0 = stablehlo.constant dense<0xFF800000> : tensor + %1 = stablehlo.constant dense<0xFF80000E> : tensor<2x3x1024x3xf32> + %2 = "quantization.qcast"(%0) {volatile} : (tensor) -> tensor> + %3 = "quantization.dcast"(%2) : (tensor>) -> tensor + %4 = "quantization.qcast"(%1) {volatile} : (tensor<2x3x1024x3xf32>) -> tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>> + %5 = "quantization.dcast"(%4) : (tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>>) -> tensor<2x3x1024x3xf32> + %6 = "quantization.qcast"(%arg0) {volatile} : (tensor<2x3x1x1024xf32>) -> tensor<2x3x1x1024x!quant.uniform> + %7 = "quantization.dcast"(%6) : (tensor<2x3x1x1024x!quant.uniform>) -> tensor<2x3x1x1024xf32> + %8 = "tf.XlaCallModule"(%7, %5) <{Sout = [#tf_type.shape<2x3x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> + %9 = "quantization.qcast"(%8) {volatile} : (tensor<2x3x1x3xf32>) -> tensor<2x3x1x3x!quant.uniform> + %10 = "quantization.dcast"(%9) : (tensor<2x3x1x3x!quant.uniform>) -> tensor<2x3x1x3xf32> + %11 = "stablehlo.reduce_window"(%10, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %14 = stablehlo.maximum %arg1, %arg2 : tensor + stablehlo.return %14 : tensor + }) {padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64>, window_dimensions = array} : (tensor<2x3x1x3xf32>, tensor) -> tensor<2x3x1x3xf32> + %12 = "quantization.qcast"(%11) {volatile} : (tensor<2x3x1x3xf32>) -> tensor<2x3x1x3x!quant.uniform> + %13 = "quantization.dcast"(%12) : (tensor<2x3x1x3x!quant.uniform>) -> tensor<2x3x1x3xf32> + return %13 : tensor<2x3x1x3xf32> + } + + // CHECK: quantized_dot_general_fn_1 + func.func private @composite_dot_general_fn_1(%arg0: tensor<2x3x1x1024xf32>, %arg1: tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = 
stablehlo.dot_general + // CHECK: %[[RQ:.*]] = stablehlo.uniform_quantize %[[DOT]] + // CHECK: return %[[RQ]] + + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0, 1] x [0, 1], contracting_dims = [3] x [2] {mhlo.frontend_attributes = {grad_x = "false", grad_y = "false"}} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> + return %0 : tensor<2x3x1x3xf32> + } +} + +// ----- + +// Tests if reduce_window op preceding quantized function is quantized. + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1722 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: main_00 + // CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x1x1024xf32> + func.func private @main_00(%arg0: tensor<2x3x1x1024xf32>) -> tensor<2x3x1x3xf32> attributes {tf._original_func_name = "main_0"} { + // CHECK: %[[CST0:.*]] = stablehlo.constant dense<0xFF800000> : tensor + // CHECK: %[[CST1:.*]] = stablehlo.constant dense<0xFF80000E> : tensor<2x3x1024x3xf32> + // CHECK: %[[Q0:.*]] = "quantization.qcast"(%[[CST0]]) + // CHECK: %[[Q1:.*]] = "quantization.qcast"(%[[ARG0]]) + + // CHECK: %[[REDUCE:.*]] = "stablehlo.reduce_window"(%[[Q1]], %[[Q0]]) + // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64> + // CHECK-SAME: window_dimensions = array + // CHECK: %[[ARG1:.*]]: tensor>, %[[ARG2:.*]]: tensor> + // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ARG1]], %[[ARG2]] : tensor> + // CHECK: stablehlo.return %[[MAX]] : tensor> + // CHECK: (tensor<2x3x1x1024x!quant.uniform>, tensor>) -> tensor<2x3x1x1024x!quant.uniform> + + // CHECK: %[[Q2:.*]] = "quantization.qcast"(%[[CST1]]) + // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[REDUCE]], %[[Q2]]) + + // CHECK: %[[DQ:.*]] = "quantization.dcast"(%[[CALL]]) + // CHECK: return %[[DQ]] + + %0 = stablehlo.constant dense<0xFF800000> : tensor + %1 = stablehlo.constant dense<0xFF80000E> : tensor<2x3x1024x3xf32> + %2 = "quantization.qcast"(%0) {volatile} : (tensor) -> tensor> + %3 = "quantization.dcast"(%2) : (tensor>) -> tensor + %4 = "quantization.qcast"(%arg0) {volatile} : (tensor<2x3x1x1024xf32>) -> tensor<2x3x1x1024x!quant.uniform> + %5 = "quantization.dcast"(%4) : (tensor<2x3x1x1024x!quant.uniform>) -> tensor<2x3x1x1024xf32> + %6 = "stablehlo.reduce_window"(%5, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %14 = stablehlo.maximum %arg1, %arg2 : tensor + stablehlo.return %14 : tensor + }) {padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64>, window_dimensions = array} : (tensor<2x3x1x1024xf32>, tensor) -> tensor<2x3x1x1024xf32> + %7 = "quantization.qcast"(%6) {volatile} : (tensor<2x3x1x1024xf32>) -> tensor<2x3x1x1024x!quant.uniform> + %8 = "quantization.dcast"(%7) : (tensor<2x3x1x1024x!quant.uniform>) -> tensor<2x3x1x1024xf32> + %9 = "quantization.qcast"(%1) {volatile} : (tensor<2x3x1024x3xf32>) -> tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>> + %10 = "quantization.dcast"(%9) : (tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>>) -> tensor<2x3x1024x3xf32> + %11 = "tf.XlaCallModule"(%8, %10) <{Sout = [#tf_type.shape<2x3x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) 
-> tensor<2x3x1x3xf32> + %12 = "quantization.qcast"(%11) {volatile} : (tensor<2x3x1x3xf32>) -> tensor<2x3x1x3x!quant.uniform> + %13 = "quantization.dcast"(%12) : (tensor<2x3x1x3x!quant.uniform>) -> tensor<2x3x1x3xf32> + return %13 : tensor<2x3x1x3xf32> + } + + // CHECK: quantized_dot_general_fn_1 + func.func private @composite_dot_general_fn_1(%arg0: tensor<2x3x1x1024xf32>, %arg1: tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = stablehlo.dot_general + // CHECK: %[[RQ:.*]] = stablehlo.uniform_quantize %[[DOT]] + // CHECK: return %[[RQ]] + + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0, 1] x [0, 1], contracting_dims = [3] x [2] {mhlo.frontend_attributes = {grad_x = "false", grad_y = "false"}} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> + return %0 : tensor<2x3x1x3xf32> + } +} + +// ----- + +// Tests if reduce_window op following quantized same-scale op is quantized. + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1722 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: main_00 + // CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x1x1024xf32> + func.func private @main_00(%arg0: tensor<2x3x1x1024xf32>) -> tensor<2x3x3xf32> attributes {tf._original_func_name = "main_0"} { + // CHECK: %[[CST0:.*]] = stablehlo.constant dense<0xFF800000> : tensor + // CHECK: %[[CST1:.*]] = stablehlo.constant dense<0xFF80000E> : tensor<2x3x1024x3xf32> + // CHECK: %[[Q0:.*]] = "quantization.qcast"(%[[CST0]]) + // CHECK: %[[Q1:.*]] = "quantization.qcast"(%[[CST1]]) + // CHECK: %[[Q2:.*]] = "quantization.qcast"(%[[ARG0]]) + // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[Q2]], %[[Q1]]) + // CHECK: %[[RESHAPE:.*]] = stablehlo.reshape %[[CALL]] + + // CHECK: %[[REDUCE:.*]] = "stablehlo.reduce_window"(%[[RESHAPE]], %[[Q0]]) + // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [0, 0]]> : tensor<3x2xi64> + // CHECK-SAME: window_dimensions = array + // CHECK: %[[ARG1:.*]]: tensor>, %[[ARG2:.*]]: tensor> + // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ARG1]], %[[ARG2]] : tensor> + // CHECK: stablehlo.return %[[MAX]] : tensor> + // CHECK: (tensor<2x3x3x!quant.uniform>, tensor>) -> tensor<2x3x3x!quant.uniform> + + // CHECK: %[[DQ:.*]] = "quantization.dcast"(%[[REDUCE]]) + // CHECK: return %[[DQ]] + + %0 = stablehlo.constant dense<0xFF800000> : tensor + %1 = stablehlo.constant dense<0xFF80000E> : tensor<2x3x1024x3xf32> + %2 = "quantization.qcast"(%0) {volatile} : (tensor) -> tensor> + %3 = "quantization.dcast"(%2) : (tensor>) -> tensor + %4 = "quantization.qcast"(%1) {volatile} : (tensor<2x3x1024x3xf32>) -> tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>> + %5 = "quantization.dcast"(%4) : (tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>>) -> tensor<2x3x1024x3xf32> + %6 = "quantization.qcast"(%arg0) {volatile} : (tensor<2x3x1x1024xf32>) -> tensor<2x3x1x1024x!quant.uniform> + %7 = "quantization.dcast"(%6) : (tensor<2x3x1x1024x!quant.uniform>) -> tensor<2x3x1x1024xf32> + %8 = "tf.XlaCallModule"(%7, %5) <{Sout = [#tf_type.shape<2x3x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<2x3x1x1024xf32>, 
tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> + %9 = "quantization.qcast"(%8) {volatile} : (tensor<2x3x1x3xf32>) -> tensor<2x3x1x3x!quant.uniform> + %10 = "quantization.dcast"(%9) : (tensor<2x3x1x3x!quant.uniform>) -> tensor<2x3x1x3xf32> + %11 = stablehlo.reshape %10 : (tensor<2x3x1x3xf32>) -> tensor<2x3x3xf32> + %12 = "quantization.qcast"(%11) {volatile} : (tensor<2x3x3xf32>) -> tensor<2x3x3x!quant.uniform> + %13 = "quantization.dcast"(%12) : (tensor<2x3x3x!quant.uniform>) -> tensor<2x3x3xf32> + %14 = "stablehlo.reduce_window"(%13, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %17 = stablehlo.maximum %arg1, %arg2 : tensor + stablehlo.return %17 : tensor + }) {padding = dense<[[0, 0], [1, 1], [0, 0]]> : tensor<3x2xi64>, window_dimensions = array} : (tensor<2x3x3xf32>, tensor) -> tensor<2x3x3xf32> + %15 = "quantization.qcast"(%14) {volatile} : (tensor<2x3x3xf32>) -> tensor<2x3x3x!quant.uniform> + %16 = "quantization.dcast"(%15) : (tensor<2x3x3x!quant.uniform>) -> tensor<2x3x3xf32> + return %16 : tensor<2x3x3xf32> + } + + // CHECK: quantized_dot_general_fn_1 + func.func private @composite_dot_general_fn_1(%arg0: tensor<2x3x1x1024xf32>, %arg1: tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = stablehlo.dot_general + // CHECK: %[[RQ:.*]] = stablehlo.uniform_quantize %[[DOT]] + // CHECK: return %[[RQ]] + + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0, 1] x [0, 1], contracting_dims = [3] x [2] {mhlo.frontend_attributes = {grad_x = "false", grad_y = "false"}} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> + return %0 : tensor<2x3x1x3xf32> + } +} + +// ----- + +// Tests if reduce_window op preceding quantized same-scale op is quantized. + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1722 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: main_00 + // CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x1024xf32> + func.func private @main_00(%arg0: tensor<2x3x1024xf32>) -> tensor<2x3x1x3xf32> attributes {tf._original_func_name = "main_0"} { + // CHECK: %[[CST0:.*]] = stablehlo.constant dense<0xFF800000> : tensor + // CHECK: %[[CST1:.*]] = stablehlo.constant dense<0xFF80000E> : tensor<2x3x1024x3xf32> + // CHECK: %[[Q0:.*]] = "quantization.qcast"(%[[CST0]]) + // CHECK: %[[Q1:.*]] = "quantization.qcast"(%[[ARG0]]) + + // CHECK: %[[REDUCE:.*]] = "stablehlo.reduce_window"(%[[Q1]], %[[Q0]]) + // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [0, 0]]> : tensor<3x2xi64> + // CHECK-SAME: window_dimensions = array + // CHECK: %[[ARG1:.*]]: tensor>, %[[ARG2:.*]]: tensor> + // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ARG1]], %[[ARG2]] : tensor> + // CHECK: stablehlo.return %[[MAX]] : tensor> + // CHECK: (tensor<2x3x1024x!quant.uniform>, tensor>) -> tensor<2x3x1024x!quant.uniform> + + // CHECK: %[[RESHAPE:.*]] = stablehlo.reshape %[[REDUCE]] + // CHECK: %[[Q2:.*]] = "quantization.qcast"(%[[CST1]]) + // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[RESHAPE]], %[[Q2]]) + + // CHECK: %[[DQ:.*]] = "quantization.dcast"(%[[CALL]]) + // CHECK: return %[[DQ]] + + %0 = stablehlo.constant dense<0xFF800000> : tensor + %1 = stablehlo.constant dense<0xFF80000E> : tensor<2x3x1024x3xf32> + %2 = "quantization.qcast"(%0) {volatile} : (tensor) -> tensor> + %3 = "quantization.dcast"(%2) : (tensor>) -> tensor + %4 = "quantization.qcast"(%arg0) {volatile} : (tensor<2x3x1024xf32>) -> tensor<2x3x1024x!quant.uniform> + %5 = "quantization.dcast"(%4) : (tensor<2x3x1024x!quant.uniform>) 
-> tensor<2x3x1024xf32> + %6 = "stablehlo.reduce_window"(%5, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %17 = stablehlo.maximum %arg1, %arg2 : tensor + stablehlo.return %17 : tensor + }) {padding = dense<[[0, 0], [1, 1], [0, 0]]> : tensor<3x2xi64>, window_dimensions = array} : (tensor<2x3x1024xf32>, tensor) -> tensor<2x3x1024xf32> + %7 = "quantization.qcast"(%6) {volatile} : (tensor<2x3x1024xf32>) -> tensor<2x3x1024x!quant.uniform> + %8 = "quantization.dcast"(%7) : (tensor<2x3x1024x!quant.uniform>) -> tensor<2x3x1024xf32> + %9 = stablehlo.reshape %8 : (tensor<2x3x1024xf32>) -> tensor<2x3x1x1024xf32> + %10 = "quantization.qcast"(%9) {volatile} : (tensor<2x3x1x1024xf32>) -> tensor<2x3x1x1024x!quant.uniform> + %11 = "quantization.dcast"(%10) : (tensor<2x3x1x1024x!quant.uniform>) -> tensor<2x3x1x1024xf32> + %12 = "quantization.qcast"(%1) {volatile} : (tensor<2x3x1024x3xf32>) -> tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>> + %13 = "quantization.dcast"(%12) : (tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>>) -> tensor<2x3x1024x3xf32> + %14 = "tf.XlaCallModule"(%11, %13) <{Sout = [#tf_type.shape<2x3x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> + %15 = "quantization.qcast"(%14) {volatile} : (tensor<2x3x1x3xf32>) -> tensor<2x3x1x3x!quant.uniform> + %16 = "quantization.dcast"(%15) : (tensor<2x3x1x3x!quant.uniform>) -> tensor<2x3x1x3xf32> + return %16 : tensor<2x3x1x3xf32> + } + + // CHECK: quantized_dot_general_fn_1 + func.func private @composite_dot_general_fn_1(%arg0: tensor<2x3x1x1024xf32>, %arg1: tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = stablehlo.dot_general + // CHECK: %[[RQ:.*]] = stablehlo.uniform_quantize %[[DOT]] + // CHECK: return %[[RQ]] + + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0, 1] x [0, 1], contracting_dims = [3] x [2] {mhlo.frontend_attributes = {grad_x = "false", grad_y = "false"}} : (tensor<2x3x1x1024xf32>, tensor<2x3x1024x3xf32>) -> tensor<2x3x1x3xf32> + return %0 : tensor<2x3x1x3xf32> + } +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/tf_quantize_same_scale.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/tf_quantize_same_scale.mlir new file mode 100644 index 000000000000..5ab6ea4101db --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/tf_quantize_same_scale.mlir @@ -0,0 +1,373 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-quantize -verify-each=false | FileCheck %s + +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: same_scale_after_composite + // CHECK-SAME: %[[ARG0:.*]]: tensor<1x2xf32> + // CHECK-SAME: %[[ARG1:.*]]: tensor<2x3xf32> + func.func private @same_scale_after_composite(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<3x1xf32> { + // CHECK: %[[Q1:.*]] = "quantization.qcast"(%[[ARG0]]) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + // CHECK: %[[Q2:.*]] = "quantization.qcast"(%[[ARG1]]) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, 
{6.000000e-03,6.000000e-03,6.000000e-03}>> + // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[Q1]], %[[Q2]]) + // CHECK-SAME: tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[RESHAPE:.*]] = stablehlo.reshape %[[CALL]] : (tensor<1x3x!quant.uniform>) -> tensor<3x1x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantization.dcast"(%[[RESHAPE]]) : (tensor<3x1x!quant.uniform>) -> tensor<3x1xf32> + // CHECK: return %[[DQ]] + %0 = "quantization.qcast"(%arg0) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %1 = "quantization.dcast"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> + %2 = "quantization.qcast"(%arg1) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + %3 = "quantization.dcast"(%2) : (tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<2x3xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %5 = "quantization.qcast"(%4) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %6 = "quantization.dcast"(%5) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %7 = stablehlo.reshape %6 : (tensor<1x3xf32>) -> tensor<3x1xf32> + %8 = "quantization.qcast"(%7) {volatile} : (tensor<3x1xf32>) -> tensor<3x1x!quant.uniform> + %9 = "quantization.dcast"(%8) : (tensor<3x1x!quant.uniform>) -> tensor<3x1xf32> + return %9 : tensor<3x1xf32> + } + + // CHECK: quantized_dot_general_fn_1 + // CHECK-SAME: %[[ARG2:.*]]: tensor<1x2x!quant.uniform> + // CHECK-SAME: %[[ARG3:.*]]: tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %[[ARG2]], %[[ARG3]] + // CHECK-SAME: (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[Q3:.*]] = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + // CHECK: return %[[Q3]] + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} + +// ----- + +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: same_scale_indirectly_connected + // CHECK-SAME: %[[ARG0:.*]]: tensor<1x2xf32> + // CHECK-SAME: %[[ARG1:.*]]: tensor<2x3xf32> + func.func private @same_scale_indirectly_connected(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> { + // CHECK: %[[Q1:.*]] = "quantization.qcast"(%[[ARG0]]) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + // CHECK: %[[Q2:.*]] = "quantization.qcast"(%[[ARG1]]) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[Q1]], %[[Q2]]) + // CHECK-SAME: tensor<1x2x!quant.uniform>, 
tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[RESHAPE:.*]] = stablehlo.reshape %[[CALL]] : (tensor<1x3x!quant.uniform>) -> tensor<3x1x!quant.uniform> + // CHECK: %[[TRANSPOSE:.*]] = stablehlo.transpose %[[RESHAPE]], dims = [1, 0] : (tensor<3x1x!quant.uniform>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantization.dcast"(%[[TRANSPOSE]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + // CHECK: return %[[DQ]] + + %0 = "quantization.qcast"(%arg0) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %1 = "quantization.dcast"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> + %2 = "quantization.qcast"(%arg1) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + %3 = "quantization.dcast"(%2) : (tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<2x3xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %5 = "quantization.qcast"(%4) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %6 = "quantization.dcast"(%5) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %7 = stablehlo.reshape %6 : (tensor<1x3xf32>) -> tensor<3x1xf32> + %8 = "quantization.qcast"(%7) {volatile} : (tensor<3x1xf32>) -> tensor<3x1x!quant.uniform> + %9 = "quantization.dcast"(%8) : (tensor<3x1x!quant.uniform>) -> tensor<3x1xf32> + %10 = stablehlo.transpose %9, dims = [1, 0] : (tensor<3x1xf32>) -> tensor<1x3xf32> + %11 = "quantization.qcast"(%10) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %12 = "quantization.dcast"(%11) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %12 : tensor<1x3xf32> + } + + // CHECK: quantized_dot_general_fn_1 + // CHECK-SAME: %[[ARG2:.*]]: tensor<1x2x!quant.uniform> + // CHECK-SAME: %[[ARG3:.*]]: tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %[[ARG2]], %[[ARG3]] + // CHECK-SAME: (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[Q3:.*]] = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + // CHECK: return %[[Q3]] + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} + +// ----- + +// CHECK-LABEL: same_scale_not_connected_to_composite +func.func @same_scale_not_connected_to_composite() -> tensor<3x1xf32> { + // CHECK: %[[CST:.*]] = stablehlo.constant + // CHECK: %[[Q1:.*]] = "quantization.qcast"(%[[CST]]) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[DQ1:.*]] = "quantization.dcast"(%[[Q1]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + // CHECK: %[[RESHAPE:.*]] = stablehlo.reshape %[[DQ1]] + // CHECK: %[[Q2:.*]] = 
"quantization.qcast"(%[[RESHAPE]]) {volatile} : (tensor<3x1xf32>) -> tensor<3x1x!quant.uniform> + // CHECK: %[[DQ2:.*]] = "quantization.dcast"(%[[Q2]]) : (tensor<3x1x!quant.uniform>) -> tensor<3x1xf32> + // CHECK: return %[[DQ2]] + + %0 = stablehlo.constant dense<1.000000e+00> : tensor<1x3xf32> + %1 = "quantization.qcast"(%0) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %2 = "quantization.dcast"(%1) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %3 = stablehlo.reshape %2 : (tensor<1x3xf32>) -> tensor<3x1xf32> + %4 = "quantization.qcast"(%3) {volatile} : (tensor<3x1xf32>) -> tensor<3x1x!quant.uniform> + %5 = "quantization.dcast"(%4) : (tensor<3x1x!quant.uniform>) -> tensor<3x1xf32> + return %5 : tensor<3x1xf32> +} + +// ----- + +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: concatenate_and_composite + // CHECK-SAME: %[[ARG0:.*]]: tensor<3x2xf32> + // CHECK-SAME: %[[ARG1:.*]]: tensor<1x2xf32> + // CHECK-SAME: %[[ARG2:.*]]: tensor<2x5xf32> + func.func private @concatenate_and_composite(%arg0: tensor<3x2xf32>, %arg1: tensor<1x2xf32>, %arg2: tensor<2x5xf32>) -> tensor<4x5xf32> { + // CHECK: %[[Q1:.*]] = "quantization.qcast"(%[[ARG0]]) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform> + // CHECK: %[[Q2:.*]] = "quantization.qcast"(%[[ARG1]]) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + // CHECK: %[[CONCAT:.*]] = stablehlo.concatenate %[[Q1]], %[[Q2]], dim = 0 + // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<4x2x!quant.uniform> + // CHECK: %[[Q3:.*]] = "quantization.qcast"(%[[ARG2]]) {volatile} : (tensor<2x5xf32>) -> tensor<2x5x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>> + // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[CONCAT]], %[[Q3]]) + // CHECK-SAME: (tensor<4x2x!quant.uniform>, tensor<2x5x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<4x5x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantization.dcast"(%[[CALL]]) : (tensor<4x5x!quant.uniform>) -> tensor<4x5xf32> + // CHECK: return %[[DQ]] + + %0 = "quantization.qcast"(%arg0) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform> + %1 = "quantization.dcast"(%0) : (tensor<3x2x!quant.uniform>) -> tensor<3x2xf32> + %2 = "quantization.qcast"(%arg1) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %3 = "quantization.dcast"(%2) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> + %4 = "stablehlo.concatenate"(%1, %3) { + dimension = 0 : i64 + } : (tensor<3x2xf32>, tensor<1x2xf32>) -> tensor<4x2xf32> + %5 = "quantization.qcast"(%4) {volatile} : (tensor<4x2xf32>) -> tensor<4x2x!quant.uniform> + %6 = "quantization.dcast"(%5) : (tensor<4x2x!quant.uniform>) -> tensor<4x2xf32> + %7 = "quantization.qcast"(%arg2) {volatile} : (tensor<2x5xf32>) -> tensor<2x5x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>> + %8 = "quantization.dcast"(%7) : (tensor<2x5x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<2x5xf32> + %9 = "tf.XlaCallModule"(%6, %8) {Sout = [#tf_type.shape<4x5>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : 
(tensor<4x2xf32>, tensor<2x5xf32>) -> tensor<4x5xf32> + %10 = "quantization.qcast"(%9) {volatile} : (tensor<4x5xf32>) -> tensor<4x5x!quant.uniform> + %11 = "quantization.dcast"(%10) : (tensor<4x5x!quant.uniform>) -> tensor<4x5xf32> + return %11 : tensor<4x5xf32> + } + + // CHECK: quantized_dot_general_fn_1 + // CHECK-SAME: %[[ARG3:.*]]: tensor<4x2x!quant.uniform> + // CHECK-SAME: %[[ARG4:.*]]: tensor<2x5x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>> + func.func private @composite_dot_general_fn_1(%arg0: tensor<4x2xf32>, %arg1: tensor<2x5xf32>) -> tensor<4x5xf32> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %[[ARG3]], %[[ARG4]] + // CHECK-SAME: (tensor<4x2x!quant.uniform>, tensor<2x5x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<4x5x!quant.uniform> + // CHECK: %[[Q3:.*]] = stablehlo.uniform_quantize %0 : (tensor<4x5x!quant.uniform>) -> tensor<4x5x!quant.uniform> + // CHECK: return %[[Q3]] + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<4x2xf32>, tensor<2x5xf32>) -> tensor<4x5xf32> + return %0 : tensor<4x5xf32> + } +} + +// ----- + +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: composite_and_pad + // CHECK-SAME: %[[ARG0:.*]]: tensor<1x2xf32> + // CHECK-SAME: %[[ARG1:.*]]: tensor<2x3xf32> + // CHECK-SAME: %[[ARG2:.*]]: tensor + func.func private @composite_and_pad(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>, %arg2: tensor) -> tensor<3x9xf32> { + // CHECK: %[[Q1:.*]] = "quantization.qcast"(%[[ARG0]]) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + // CHECK: %[[Q2:.*]] = "quantization.qcast"(%[[ARG1]]) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[Q1]], %[[Q2]]) + // CHECK-SAME: (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[Q3:.*]] = "quantization.qcast"(%arg2) {volatile} : (tensor) -> tensor> + // CHECK: %[[PAD:.*]] = stablehlo.pad %[[CALL]], %[[Q3]] + // CHECK-SAME: (tensor<1x3x!quant.uniform>, tensor>) -> tensor<3x9x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantization.dcast"(%[[PAD]]) : (tensor<3x9x!quant.uniform>) -> tensor<3x9xf32> + // CHECK: return %[[DQ]] + %0 = "quantization.qcast"(%arg0) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %1 = "quantization.dcast"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> + %2 = "quantization.qcast"(%arg1) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + %3 = "quantization.dcast"(%2) : (tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<2x3xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %5 = "quantization.qcast"(%4) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %6 = "quantization.dcast"(%5) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %7 = 
"quantization.qcast"(%arg2) {volatile} : (tensor) -> tensor> + %8 = "quantization.dcast"(%7) : (tensor>) -> tensor + %9 = stablehlo.pad %6, %8, low = [0, 1], high = [2, 1], interior = [0, 2] : (tensor<1x3xf32>, tensor) -> tensor<3x9xf32> + %10 = "quantization.qcast"(%9) {volatile} : (tensor<3x9xf32>) -> tensor<3x9x!quant.uniform> + %11 = "quantization.dcast"(%10) : (tensor<3x9x!quant.uniform>) -> tensor<3x9xf32> + return %11 : tensor<3x9xf32> + } + + // CHECK: quantized_dot_general_fn_1 + // CHECK-SAME: %[[ARG2:.*]]: tensor<1x2x!quant.uniform> + // CHECK-SAME: %[[ARG3:.*]]: tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %[[ARG2]], %[[ARG3]] + // CHECK-SAME: (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[Q3:.*]] = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + // CHECK: return %[[Q3]] + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} + +// ----- + +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: composite_and_select + // CHECK-SAME: %[[ARG0:.*]]: tensor<1x2xf32> + // CHECK-SAME: %[[ARG1:.*]]: tensor<2x3xf32> + // CHECK-SAME: %[[ARG2:.*]]: tensor<1x3xi1> + // CHECK-SAME: %[[ARG3:.*]]: tensor<1x3xf32> + func.func private @composite_and_select(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>, %arg2: tensor<1x3xi1>, %arg3: tensor<1x3xf32>) -> tensor<1x3xf32> { + // CHECK: %[[Q1:.*]] = "quantization.qcast"(%[[ARG0]]) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + // CHECK: %[[Q2:.*]] = "quantization.qcast"(%[[ARG1]]) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[Q1]], %[[Q2]]) + // CHECK-SAME: (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[Q3:.*]] = "quantization.qcast"(%[[ARG3]]) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[SELECT:.*]] = stablehlo.select %[[ARG2]], %[[CALL]], %[[Q3]] : tensor<1x3xi1>, tensor<1x3x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantization.dcast"(%[[SELECT]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + // CHECK: return %[[DQ]] + %0 = "quantization.qcast"(%arg0) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %1 = "quantization.dcast"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> + %2 = "quantization.qcast"(%arg1) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + %3 = "quantization.dcast"(%2) : (tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<2x3xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : 
(tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %5 = "quantization.qcast"(%4) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %6 = "quantization.dcast"(%5) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %7 = "quantization.qcast"(%arg3) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %8 = "quantization.dcast"(%7) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %9 = stablehlo.select %arg2, %6, %8 : (tensor<1x3xi1>, tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + %10 = "quantization.qcast"(%9) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %11 = "quantization.dcast"(%10) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %11 : tensor<1x3xf32> + } + + // CHECK: quantized_dot_general_fn_1 + // CHECK-SAME: %[[ARG2:.*]]: tensor<1x2x!quant.uniform> + // CHECK-SAME: %[[ARG3:.*]]: tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %[[ARG2]], %[[ARG3]] + // CHECK-SAME: (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[Q3:.*]] = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + // CHECK: return %[[Q3]] + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} + +// ----- + +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: composite_and_broadcast_in_dim + // CHECK-SAME: %[[ARG0:.*]]: tensor<1x2xf32> + // CHECK-SAME: %[[ARG1:.*]]: tensor<2x3xf32> + func.func private @composite_and_broadcast_in_dim(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<2x3x2xf32> { + // CHECK: %[[Q1:.*]] = "quantization.qcast"(%[[ARG0]]) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + // CHECK: %[[Q2:.*]] = "quantization.qcast"(%[[ARG1]]) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[Q1]], %[[Q2]]) + // CHECK-SAME: (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[BROADCAST:.*]] = stablehlo.broadcast_in_dim %[[CALL]], dims = [2, 1] : (tensor<1x3x!quant.uniform>) -> tensor<2x3x2x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantization.dcast"(%[[BROADCAST]]) : (tensor<2x3x2x!quant.uniform>) -> tensor<2x3x2xf32> + // CHECK: return %[[DQ]] + %0 = "quantization.qcast"(%arg0) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %1 = "quantization.dcast"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> + %2 = "quantization.qcast"(%arg1) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + %3 = "quantization.dcast"(%2) : (tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<2x3xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], 
has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %5 = "quantization.qcast"(%4) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %6 = "quantization.dcast"(%5) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %7 = stablehlo.broadcast_in_dim %6, dims = [2, 1] : (tensor<1x3xf32>) -> tensor<2x3x2xf32> + %8 = "quantization.qcast"(%7) {volatile} : (tensor<2x3x2xf32>) -> tensor<2x3x2x!quant.uniform> + %9 = "quantization.dcast"(%8) : (tensor<2x3x2x!quant.uniform>) -> tensor<2x3x2xf32> + return %9 : tensor<2x3x2xf32> + } + + // CHECK: quantized_dot_general_fn_1 + // CHECK-SAME: %[[ARG2:.*]]: tensor<1x2x!quant.uniform> + // CHECK-SAME: %[[ARG3:.*]]: tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>> + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %[[ARG2]], %[[ARG3]] + // CHECK-SAME: (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[Q3:.*]] = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + // CHECK: return %[[Q3]] + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} + +// ----- + +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: composite_and_gather + // CHECK-SAME: %[[ARG0:.*]]: tensor<3x4x5xf32> + // CHECK-SAME: %[[ARG1:.*]]: tensor<3x5x2xf32> + // CHECK-SAME: %[[ARG2:.*]]: tensor<2x3x2xi64> + func.func private @composite_and_gather(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x2xf32>, %arg2: tensor<2x3x2xi64>) -> tensor<2x3x2x2xf32> { + // CHECK: %[[Q1:.*]] = "quantization.qcast"(%[[ARG0]]) {volatile} : (tensor<3x4x5xf32>) -> tensor<3x4x5x!quant.uniform> + // CHECK: %[[Q2:.*]] = "quantization.qcast"(%[[ARG1]]) {volatile} : (tensor<3x5x2xf32>) -> tensor<3x5x2x!quant.uniform:f32, 6.000000e-03>> + // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[Q1]], %[[Q2]]) + // CHECK-SAME: (tensor<3x4x5x!quant.uniform>, tensor<3x5x2x!quant.uniform:f32, 6.000000e-03>>) -> tensor<3x4x2x!quant.uniform> + // CHECK: %[[GATHER:.*]] = "stablehlo.gather"(%[[CALL]], %[[ARG2]]) + // CHECK-SAME: (tensor<3x4x2x!quant.uniform>, tensor<2x3x2xi64>) -> tensor<2x3x2x2x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantization.dcast"(%[[GATHER]]) : (tensor<2x3x2x2x!quant.uniform>) -> tensor<2x3x2x2xf32> + // CHECK: return %[[DQ]] + %0 = "quantization.qcast"(%arg0) {volatile} : (tensor<3x4x5xf32>) -> tensor<3x4x5x!quant.uniform> + %1 = "quantization.dcast"(%0) : (tensor<3x4x5x!quant.uniform>) -> tensor<3x4x5xf32> + %2 = "quantization.qcast"(%arg1) {volatile} : (tensor<3x5x2xf32>) -> tensor<3x5x2x!quant.uniform:f32, 6.000000e-03>> + %3 = "quantization.dcast"(%2) : (tensor<3x5x2x!quant.uniform:f32, 6.000000e-03>>) -> tensor<3x5x2xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x4x5xf32>, 
tensor<3x5x2xf32>) -> tensor<3x4x2xf32> + %5 = "quantization.qcast"(%4) {volatile} : (tensor<3x4x2xf32>) -> tensor<3x4x2x!quant.uniform> + %6 = "quantization.dcast"(%5) : (tensor<3x4x2x!quant.uniform>) -> tensor<3x4x2xf32> + %7 = "stablehlo.gather"(%6, %arg2) { + dimension_numbers = #stablehlo.gather< + offset_dims = [2, 3], + collapsed_slice_dims = [0], + start_index_map = [1, 0], + index_vector_dim = 2>, + slice_sizes = array, + indices_are_sorted = false + } : (tensor<3x4x2xf32>, tensor<2x3x2xi64>) -> tensor<2x3x2x2xf32> + %8 = "quantization.qcast"(%7) {volatile} : (tensor<2x3x2x2xf32>) -> tensor<2x3x2x2x!quant.uniform> + %9 = "quantization.dcast"(%8) : (tensor<2x3x2x2x!quant.uniform>) -> tensor<2x3x2x2xf32> + return %9 : tensor<2x3x2x2xf32> + } + + // CHECK: quantized_dot_general_fn_1 + // CHECK-SAME: %[[ARG2:.*]]: tensor<3x4x5x!quant.uniform> + // CHECK-SAME: %[[ARG3:.*]]: tensor<3x5x2x!quant.uniform:f32, 6.000000e-03>> + func.func private @composite_dot_general_fn_1(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x2xf32>) -> tensor<3x4x2xf32> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %[[ARG2]], %[[ARG3]] + // CHECK-SAME: (tensor<3x4x5x!quant.uniform>, tensor<3x5x2x!quant.uniform:f32, 6.000000e-03>>) -> tensor<3x4x2x!quant.uniform> + // CHECK: %[[Q3:.*]] = stablehlo.uniform_quantize %0 : (tensor<3x4x2x!quant.uniform>) -> tensor<3x4x2x!quant.uniform> + // CHECK: return %[[Q3]] + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<3x4x5xf32>, tensor<3x5x2xf32>) -> tensor<3x4x2xf32> + return %0 : tensor<3x4x2xf32> + } +} + +// ----- + +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: composite_and_slice + // CHECK-SAME: %[[ARG0:.*]]: tensor<3x2xf32> + // CHECK-SAME: %[[ARG1:.*]]: tensor<2x4xf32> + func.func private @composite_and_slice(%arg0: tensor<3x2xf32>, %arg1: tensor<2x4xf32>) -> tensor<2x2xf32> { + // CHECK: %[[Q1:.*]] = "quantization.qcast"(%[[ARG0]]) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform> + // CHECK: %[[Q2:.*]] = "quantization.qcast"(%[[ARG1]]) {volatile} : (tensor<2x4xf32>) -> tensor<2x4x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>> + // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[Q1]], %[[Q2]]) + // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<2x4x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<3x4x!quant.uniform> + // CHECK: %[[SLICE:.*]] = stablehlo.slice %[[CALL]] [1:3, 2:4] : (tensor<3x4x!quant.uniform>) -> tensor<2x2x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantization.dcast"(%[[SLICE]]) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> + // CHECK: return %[[DQ]] + %0 = "quantization.qcast"(%arg0) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform> + %1 = "quantization.dcast"(%0) : (tensor<3x2x!quant.uniform>) -> tensor<3x2xf32> + %2 = "quantization.qcast"(%arg1) {volatile} : (tensor<2x4xf32>) -> tensor<2x4x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>> + %3 = "quantization.dcast"(%2) : (tensor<2x4x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<2x4xf32> + %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], 
disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x2xf32>, tensor<2x4xf32>) -> tensor<3x4xf32> + %5 = "quantization.qcast"(%4) {volatile} : (tensor<3x4xf32>) -> tensor<3x4x!quant.uniform> + %6 = "quantization.dcast"(%5) : (tensor<3x4x!quant.uniform>) -> tensor<3x4xf32> + %7 = stablehlo.slice %6 [1:3, 2:4] : (tensor<3x4xf32>) -> tensor<2x2xf32> + %8 = "quantization.qcast"(%7) {volatile} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> + %9 = "quantization.dcast"(%8) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> + return %9 : tensor<2x2xf32> + } + + // CHECK: quantized_dot_general_fn_1 + // CHECK-SAME: %[[ARG2:.*]]: tensor<3x2x!quant.uniform> + // CHECK-SAME: %[[ARG3:.*]]: tensor<2x4x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>> + func.func private @composite_dot_general_fn_1(%arg0: tensor<3x2xf32>, %arg1: tensor<2x4xf32>) -> tensor<3x4xf32> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %[[ARG2]], %[[ARG3]] + // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<2x4x!quant.uniform:f32:1, {6.000000e-03,6.000000e-03,6.000000e-03,6.000000e-03}>>) -> tensor<3x4x!quant.uniform> + // CHECK: %[[Q3:.*]] = stablehlo.uniform_quantize %0 : (tensor<3x4x!quant.uniform>) -> tensor<3x4x!quant.uniform> + // CHECK: return %[[Q3]] + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<3x2xf32>, tensor<2x4xf32>) -> tensor<3x4xf32> + return %0 : tensor<3x4xf32> + } +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/tf_quantize_weight_only.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/tf_quantize_weight_only.mlir new file mode 100644 index 000000000000..6a9bd42a76ae --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/tf_quantize_weight_only.mlir @@ -0,0 +1,66 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-quantize | FileCheck %s + +// Test that hybrid quantized dot_general is produced when q/dq pair only exists +// for weight. 
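+// Here "hybrid" means only the weight is quantized: as the CHECK lines below +// show, the quantized function still takes an f32 activation and returns an +// f32 result, while its weight operand carries a quantized type.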
+ +module attributes {tf_saved_model.semantics} { + func.func private @quantize_dot_general_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32> + %0 = "quantization.qcast"(%cst) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform> + %1 = "quantization.dcast"(%0) : (tensor<2x3x!quant.uniform>) -> tensor<2x3xf32> + %2 = "tf.XlaCallModule"(%arg0, %1) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } + + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} + +// CHECK-LABEL: quantize_dot_general_fn +// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2xf32> +// CHECK: %[[CST:.+]] = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32> +// CHECK: %[[Q:.+]] = "quantization.qcast"(%[[CST]]) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform> +// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[Q]]) +// CHECK-SAME: {_quantization_method = "weight_only_ptq { }"} : (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32> +// CHECK: return %[[CALL]] + +// CHECK: quantized_dot_general_fn +// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x2xf32>, %[[ARG2:.+]]: tensor<2x3x!quant.uniform>) -> tensor<1x3xf32> +// CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG1]], %[[ARG2]] +// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32> +// CHECK: return %[[DOT]] + +// ----- + +// Test that hybrid quantized convolution is produced when q/dq pair only exists +// for weight. 
+ +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = stablehlo.constant dense<3.000000e-01> : tensor<2x3x3x2xf32> + %0 = "quantization.qcast"(%cst) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform> + %1 = "quantization.dcast"(%0) : (tensor<2x3x3x2x!quant.uniform>) -> tensor<2x3x3x2xf32> + %2 = "tf.XlaCallModule"(%arg0, %1) <{Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %2 : tensor<1x3x4x2xf32> + } + + func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %0 : tensor<1x3x4x2xf32> + } +} + +// CHECK-LABEL: quantize_conv_fn +// CHECK-SAME: %[[ARG0:.+]]: tensor<1x3x4x3xf32> +// CHECK: %[[CST:.+]] = stablehlo.constant dense<3.000000e-01> : tensor<2x3x3x2xf32> +// CHECK: %[[Q:.+]] = "quantization.qcast"(%[[CST]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform> +// CHECK: %[[CALL:.+]] = call @quantized_conv_fn(%[[ARG0]], %[[Q]]) {_quantization_method = "weight_only_ptq { }"} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform>) -> tensor<1x3x4x2xf32> +// CHECK: return %[[CALL]] + +// CHECK: quantized_conv_fn +// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x3x4x3xf32>, %[[ARG2:.+]]: tensor<2x3x3x2x!quant.uniform>) -> tensor<1x3x4x2xf32> +// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[ARG1]], %[[ARG2]]) +// CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform>) -> tensor<1x3x4x2xf32> +// CHECK: return %[[CONV]] diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir index a15639671ddc..3163350bc1d3 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir @@ -855,7 +855,7 @@ module attributes {tf_saved_model.semantics} { %cst = "tf.Const"() {value = dense<1.00000000e-1> : tensor<1x2xf32>} : () -> tensor<1x2xf32> %0 = "quantfork.stats"(%arg) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x2>], _entry_function = @composite_add_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_add_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x2xf32> - // expected-error@+1 {{'stablehlo.uniform_dequantize' 
op operand #0 must be ranked tensor of 2/4/8/16/32-bit uniform quantized signed integer or 2/4/8/16/32-bit uniform quantized unsigned integer or 2/4/8/16/32-bit uniform quantized per axis signed integer or 2/4/8/16/32-bit uniform quantized per axis unsigned integer values, but got 'tensor<1x2xf32>'}} + // expected-error@+1 {{'stablehlo.uniform_dequantize' op operand #0 must be ranked tensor of per-tensor integer quantized or per-axis integer quantized values, but got 'tensor<1x2xf32>'}} %2 = "quantfork.stats"(%1) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> return %2 : tensor<1x2xf32> } @@ -876,7 +876,7 @@ module attributes {tf_saved_model.semantics} { %cst = "tf.Const"() {value = dense<1> : tensor<2x3x2xi32>} : () -> tensor<2x3x2xi32> %0 = "quantfork.stats"(%arg) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<3x4x2xf32>) -> tensor<3x4x2xf32> %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<2x3x2x2>], _entry_function = @composite_gather_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_gather_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x4x2xf32>, tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32> - // expected-error@+1 {{'stablehlo.uniform_dequantize' op operand #0 must be ranked tensor of 2/4/8/16/32-bit uniform quantized signed integer or 2/4/8/16/32-bit uniform quantized unsigned integer or 2/4/8/16/32-bit uniform quantized per axis signed integer or 2/4/8/16/32-bit uniform quantized per axis unsigned integer values, but got 'tensor<2x3x2x2xf32>'}} + // expected-error@+1 {{'stablehlo.uniform_dequantize' op operand #0 must be ranked tensor of per-tensor integer quantized or per-axis integer quantized values, but got 'tensor<2x3x2x2xf32>'}} %2 = "quantfork.stats"(%1) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<2x3x2x2xf32>) -> tensor<2x3x2x2xf32> return %2 : tensor<2x3x2x2xf32> } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir index 3cccc406c201..d455ff1421f7 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir @@ -33,11 +33,11 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p return %9#0 : tensor<1x64xf32> } - // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}, module = "", platforms = ["CPU", "TPU"], version = 9 : i64}> {_entry_function = @_stablehlo_main_1 + // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}, module = "", platforms = ["CPU", "TPU"], use_shardy_partitioner = false, version = 9 : i64}> {_entry_function = @_stablehlo_main_1 // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 
0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> // CHECK: %[[XLA_CALL_MODULE_0:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "{{.*}}", _tfl_quant_trait = "fully_quantizable"} // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_0]]) - // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}, module = "", platforms = ["CPU", "TPU"], version = 9 : i64}> {_entry_function = @_stablehlo_main_0 + // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}, module = "", platforms = ["CPU", "TPU"], use_shardy_partitioner = false, version = 9 : i64}> {_entry_function = @_stablehlo_main_0 // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[CUSTOM_AGGREGATOR_1]]) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> // CHECK: %[[XLA_CALL_MODULE_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _stablehlo_version = "{{.*}}", _tfl_quant_trait = "fully_quantizable"} // CHECK: %[[CUSTOM_AGGREGATOR_3:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_1:.*]]) @@ -91,7 +91,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p return %5 : tensor<1x1024xf32> } - // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"(%arg0) <{Sout = [#tf_type.shape<1x1024>], {{.*}}, module = "", platforms = ["CPU", "TPU"], version = 9 : i64}> {_entry_function = @_stablehlo_main_0, _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _stablehlo_version = "{{.*}}"} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"(%arg0) <{Sout = [#tf_type.shape<1x1024>], {{.*}}, module = "", platforms = ["CPU", "TPU"], use_shardy_partitioner = false, version = 9 : i64}> {_entry_function = @_stablehlo_main_0, _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _stablehlo_version = "{{.*}}"} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP]]) // CHECK: return %[[IDENTITY]] // CHECK } @@ -117,7 +117,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p return %3#0 : tensor<1x3xf32> } - // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}}, module = "", platforms = ["CPU", "TPU"], version = 9 : i64}> {_entry_function = @_stablehlo_main_0, _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _stablehlo_version = "{{.*}}"} + // CHECK: 
%[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}}, module = "", platforms = ["CPU", "TPU"], use_shardy_partitioner = false, version = 9 : i64}> {_entry_function = @_stablehlo_main_0, _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _stablehlo_version = "{{.*}}"} // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "{{.*}}" // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/shape_cstr_legalize_to_hlo.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/shape_cstr_legalize_to_hlo.mlir new file mode 100644 index 000000000000..ac7d6a51fb87 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/shape_cstr_legalize_to_hlo.mlir @@ -0,0 +1,110 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-convert-shape-to-stablehlo-with-constraints --verify-diagnostics | FileCheck %s + +// CHECK-LABEL: func.func @shape_cstr_broadcastable +func.func @shape_cstr_broadcastable(%arg0: tensor<2xindex>, %arg1: tensor<2xindex>) { + %0 = shape.cstr_broadcastable %arg0, %arg1 : tensor<2xindex>, tensor<2xindex> + shape.assuming %0 { + } + func.return + // CHECK: %[[DIMS1:.*]] = builtin.unrealized_conversion_cast %arg0 : tensor<2xindex> to tensor<2xi32> + // CHECK-NEXT: %[[DIMS2:.*]] = builtin.unrealized_conversion_cast %arg1 : tensor<2xindex> to tensor<2xi32> + // CHECK-NEXT: %[[ONES:.*]] = stablehlo.constant dense<1> : tensor<2xi32> + // CHECK-NEXT: %[[DIMS1_IS_1:.*]] = stablehlo.compare EQ, %[[DIMS1]], %[[ONES:.*]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[DIMS2_IS_1:.*]] = stablehlo.compare EQ, %[[DIMS2]], %[[ONES:.*]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[EITHER_DIM_IS_1:.*]] = stablehlo.or %[[DIMS1_IS_1]], %[[DIMS2_IS_1]] : tensor<2xi1> + // CHECK-NEXT: %[[DIMS_EQ:.*]] = stablehlo.compare EQ, %[[DIMS1]], %[[DIMS2]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[DIMS_BROADCASTABLE:.*]] = stablehlo.or %[[EITHER_DIM_IS_1]], %[[DIMS_EQ]] : tensor<2xi1> + // CHECK-NEXT: %[[TRUE:.*]] = stablehlo.constant dense : tensor<1xi1> + // CHECK-NEXT: %[[DIM1_BROADCASTABLE:.*]] = stablehlo.slice %[[DIMS_BROADCASTABLE]] [0:1] : (tensor<2xi1>) -> tensor<1xi1> + // CHECK-NEXT: %[[BROADCASTABLE_TEMP:.*]] = stablehlo.and %[[TRUE]], %[[DIM1_BROADCASTABLE]] : tensor<1xi1> + // CHECK-NEXT: %[[DIM2_BROADCASTABLE:.*]] = stablehlo.slice %[[DIMS_BROADCASTABLE]] [1:2] : (tensor<2xi1>) -> tensor<1xi1> + // CHECK-NEXT: %[[ALL_BROADCASTABLE:.*]] = stablehlo.and %[[BROADCASTABLE_TEMP]], %[[DIM2_BROADCASTABLE]] : tensor<1xi1> + // CHECK-NEXT: %[[ALL_BROADCASTABLE_SCALAR:.*]] = stablehlo.reshape %[[ALL_BROADCASTABLE]] : (tensor<1xi1>) -> tensor + // CHECK-NEXT: stablehlo.custom_call @shape_assertion(%[[ALL_BROADCASTABLE_SCALAR]]) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor) -> () + // CHECK-NEXT: 
%[[WITNESS:.*]] = shape.const_witness true + // CHECK-NEXT: shape.assuming %[[WITNESS]] { + // CHECK-NEXT: } + // CHECK-NEXT: return +} + +// ----- + +// CHECK-LABEL: func @shape_cstr_broadcastable_different_dims_1 +func.func @shape_cstr_broadcastable_different_dims_1(%arg0: tensor<2xindex>, %arg1: tensor<1xindex>) { + %0 = shape.cstr_broadcastable %arg0, %arg1 : tensor<2xindex>, tensor<1xindex> + shape.assuming %0 { + } + func.return + // CHECK: %[[DIMS1:.*]] = builtin.unrealized_conversion_cast %arg0 : tensor<2xindex> to tensor<2xi32> + // CHECK-NEXT: %[[DIMS2:.*]] = builtin.unrealized_conversion_cast %arg1 : tensor<1xindex> to tensor<1xi32> + // CHECK-NEXT: %[[PAD:.*]] = stablehlo.constant dense<1> : tensor<1xi32> + // CHECK-NEXT: %[[DIMS2_PAD:.*]] = stablehlo.concatenate %[[PAD]], %[[DIMS2]], dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + // CHECK-NEXT: %[[ONES:.*]] = stablehlo.constant dense<1> : tensor<2xi32> + // CHECK-NEXT: %[[DIMS1_IS_1:.*]] = stablehlo.compare EQ, %[[DIMS1]], %[[ONES:.*]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[DIMS2_IS_1:.*]] = stablehlo.compare EQ, %[[DIMS2_PAD]], %[[ONES:.*]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[EITHER_DIM_IS_1:.*]] = stablehlo.or %[[DIMS1_IS_1]], %[[DIMS2_IS_1]] : tensor<2xi1> + // CHECK-NEXT: %[[DIMS_EQ:.*]] = stablehlo.compare EQ, %[[DIMS1]], %[[DIMS2_PAD]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[DIMS_BROADCASTABLE:.*]] = stablehlo.or %[[EITHER_DIM_IS_1]], %[[DIMS_EQ]] : tensor<2xi1> + // CHECK-NEXT: %[[TRUE:.*]] = stablehlo.constant dense : tensor<1xi1> + // CHECK-NEXT: %[[DIM1_BROADCASTABLE:.*]] = stablehlo.slice %[[DIMS_BROADCASTABLE]] [0:1] : (tensor<2xi1>) -> tensor<1xi1> + // CHECK-NEXT: %[[BROADCASTABLE_TEMP:.*]] = stablehlo.and %[[TRUE]], %[[DIM1_BROADCASTABLE]] : tensor<1xi1> + // CHECK-NEXT: %[[DIM2_BROADCASTABLE:.*]] = stablehlo.slice %[[DIMS_BROADCASTABLE]] [1:2] : (tensor<2xi1>) -> tensor<1xi1> + // CHECK-NEXT: %[[ALL_BROADCASTABLE:.*]] = stablehlo.and %[[BROADCASTABLE_TEMP]], %[[DIM2_BROADCASTABLE]] : tensor<1xi1> + // CHECK-NEXT: %[[ALL_BROADCASTABLE_SCALAR:.*]] = stablehlo.reshape %[[ALL_BROADCASTABLE]] : (tensor<1xi1>) -> tensor + // CHECK-NEXT: stablehlo.custom_call @shape_assertion(%[[ALL_BROADCASTABLE_SCALAR]]) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor) -> () + // CHECK-NEXT: %[[WITNESS:.*]] = shape.const_witness true + // CHECK-NEXT: shape.assuming %[[WITNESS]] { + // CHECK-NEXT: } + // CHECK-NEXT: return +} + +// ----- + +// CHECK-LABEL: func @shape_cstr_broadcastable_different_dims_2 +func.func @shape_cstr_broadcastable_different_dims_2(%arg0: tensor<1xindex>, %arg1: tensor<2xindex>) { + %0 = shape.cstr_broadcastable %arg0, %arg1 : tensor<1xindex>, tensor<2xindex> + shape.assuming %0 { + } + func.return + // CHECK: %[[DIMS1:.*]] = builtin.unrealized_conversion_cast %arg0 : tensor<1xindex> to tensor<1xi32> + // CHECK-NEXT: %[[DIMS2:.*]] = builtin.unrealized_conversion_cast %arg1 : tensor<2xindex> to tensor<2xi32> + // CHECK-NEXT: %[[PAD:.*]] = stablehlo.constant dense<1> : tensor<1xi32> + // CHECK-NEXT: %[[DIMS1_PAD:.*]] = stablehlo.concatenate %[[PAD]], %[[DIMS1]], dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + // CHECK-NEXT: %[[ONES:.*]] = stablehlo.constant dense<1> : tensor<2xi32> + // CHECK-NEXT: %[[DIMS1_IS_1:.*]] = stablehlo.compare EQ, %[[DIMS1_PAD]], %[[ONES:.*]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[DIMS2_IS_1:.*]] = 
stablehlo.compare EQ, %[[DIMS2]], %[[ONES:.*]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[EITHER_DIM_IS_1:.*]] = stablehlo.or %[[DIMS1_IS_1]], %[[DIMS2_IS_1]] : tensor<2xi1> + // CHECK-NEXT: %[[DIMS_EQ:.*]] = stablehlo.compare EQ, %[[DIMS1_PAD]], %[[DIMS2]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[DIMS_BROADCASTABLE:.*]] = stablehlo.or %[[EITHER_DIM_IS_1]], %[[DIMS_EQ]] : tensor<2xi1> + // CHECK-NEXT: %[[TRUE:.*]] = stablehlo.constant dense : tensor<1xi1> + // CHECK-NEXT: %[[DIM1_BROADCASTABLE:.*]] = stablehlo.slice %[[DIMS_BROADCASTABLE]] [0:1] : (tensor<2xi1>) -> tensor<1xi1> + // CHECK-NEXT: %[[BROADCASTABLE_TEMP:.*]] = stablehlo.and %[[TRUE]], %[[DIM1_BROADCASTABLE]] : tensor<1xi1> + // CHECK-NEXT: %[[DIM2_BROADCASTABLE:.*]] = stablehlo.slice %[[DIMS_BROADCASTABLE]] [1:2] : (tensor<2xi1>) -> tensor<1xi1> + // CHECK-NEXT: %[[ALL_BROADCASTABLE:.*]] = stablehlo.and %[[BROADCASTABLE_TEMP]], %[[DIM2_BROADCASTABLE]] : tensor<1xi1> + // CHECK-NEXT: %[[ALL_BROADCASTABLE_SCALAR:.*]] = stablehlo.reshape %[[ALL_BROADCASTABLE]] : (tensor<1xi1>) -> tensor + // CHECK-NEXT: stablehlo.custom_call @shape_assertion(%[[ALL_BROADCASTABLE_SCALAR]]) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor) -> () + // CHECK-NEXT: %[[WITNESS:.*]] = shape.const_witness true + // CHECK-NEXT: shape.assuming %[[WITNESS]] { + // CHECK-NEXT: } + // CHECK-NEXT: return +} + +// ----- + +func.func @shape_cstr_broadcast_too_many_operands(%arg0: tensor<4xindex>, %arg1: tensor<4xindex>, %arg2: tensor<4xindex>) { + // expected-error@+1 {{failed to legalize operation 'shape.cstr_broadcastable' that was explicitly marked illegal}} + %0 = shape.cstr_broadcastable %arg0, %arg1, %arg2 : tensor<4xindex>, tensor<4xindex>, tensor<4xindex> + shape.assuming %0 { + } + func.return +} + +// ----- + +func.func @shape_cstr_broadcastable_input_shape(%arg0: !shape.shape, %arg1: !shape.shape) { + // expected-error@+1 {{failed to legalize operation 'shape.cstr_broadcastable' that was explicitly marked illegal}} + %0 = shape.cstr_broadcastable %arg0, %arg1 : !shape.shape, !shape.shape + shape.assuming %0 { + } + func.return +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_convert_func_to_bfloat16.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_convert_func_to_bfloat16.mlir new file mode 100644 index 000000000000..f73515b3c5e8 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_convert_func_to_bfloat16.mlir @@ -0,0 +1,128 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-convert-func-to-bfloat16 -verify-diagnostics | FileCheck %s + +// CHECK-LABEL: @add_f32(%arg0: tensor<3x3xbf16>, %arg1: tensor<3x3xbf16>) -> tensor<3x3xbf16> +func.func @add_f32(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x3xf32> { + // CHECK-NOT: f32 + // CHECK: stablehlo.add + %0 = stablehlo.add %arg0, %arg1: (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + return %0 : tensor<3x3xf32> +} + +// ----- + +// CHECK-LABEL: @add_f64(%arg0: tensor<3x3xbf16>, %arg1: tensor<3x3xbf16>) -> tensor<3x3xbf16> +func.func @add_f64(%arg0: tensor<3x3xf64>, %arg1: tensor<3x3xf64>) -> tensor<3x3xf64> { + // CHECK-NOT: f64 + // CHECK: stablehlo.add + %0 = stablehlo.add %arg0, %arg1: (tensor<3x3xf64>, tensor<3x3xf64>) -> tensor<3x3xf64> + return %0 : tensor<3x3xf64> +} + +// ----- + +// CHECK-LABEL: @constant_f32() -> tensor<2x2xbf16> +func.func @constant_f32() -> tensor<2x2xf32> { + // CHECK-NOT: 
f32 + // CHECK{LITERAL}: stablehlo.constant dense<[[1.398440e+00, 0.000000e+00], [3.093750e+00, -2.001950e-01]]> : tensor<2x2xbf16> + %0 = stablehlo.constant dense<[[1.4, 0.0], [3.1, -0.2]]> : tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} + +// ----- + +func.func @constant_elided() -> tensor<2x2xf32> { + // expected-error @+1 {{failed to legalize operation 'stablehlo.constant' that was explicitly marked illegal}} + %0 = stablehlo.constant dense_resource<__elided__> : tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} + +// ----- + +// CHECK-LABEL: @reduce_window_f32(%arg0: tensor<2x3x1x3xbf16>) -> tensor<2x3x1x3xbf16> +func.func @reduce_window_f32(%arg0: tensor<2x3x1x3xf32>) -> tensor<2x3x1x3xf32> { + // CHECK-NOT: f32 + // CHECK: stablehlo.reduce_window + %0 = stablehlo.constant dense<0.0> : tensor + %1 = "stablehlo.reduce_window"(%arg0, %0) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %2 = stablehlo.maximum %arg1, %arg2 : tensor + stablehlo.return %2 : tensor + }) {padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64>, window_dimensions = array} : (tensor<2x3x1x3xf32>, tensor) -> tensor<2x3x1x3xf32> + return %1 : tensor<2x3x1x3xf32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_i32_f32(%arg0: tensor<1x256128xi32>) -> tensor<1x256128xbf16> +func.func @bitcast_convert_i32_f32(%arg0: tensor<1x256128xi32>) -> tensor<1x256128xf32> { + // CHECK: %[[BITCAST:.*]] = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xi32>) -> tensor<1x256128xf32> + // CHECK: %[[CONVERT:.*]] = stablehlo.convert %[[BITCAST]] : (tensor<1x256128xf32>) -> tensor<1x256128xbf16> + // CHECK: return %[[CONVERT]] : tensor<1x256128xbf16> + %20 = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xi32>) -> tensor<1x256128xf32> + return %20 : tensor<1x256128xf32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_f32_i32(%arg0: tensor<1x256128xbf16>) -> tensor<1x256128xi32> +func.func @bitcast_convert_f32_i32(%arg0: tensor<1x256128xf32>) -> tensor<1x256128xi32> { + // CHECK: %[[CONVERT:.*]] = stablehlo.convert %arg0 : (tensor<1x256128xbf16>) -> tensor<1x256128xf32> + // CHECK: %[[BITCAST:.*]] = stablehlo.bitcast_convert %[[CONVERT]] : (tensor<1x256128xf32>) -> tensor<1x256128xi32> + // CHECK: return %[[BITCAST]] : tensor<1x256128xi32> + %20 = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xf32>) -> tensor<1x256128xi32> + return %20 : tensor<1x256128xi32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_ui32_f32(%arg0: tensor<1x256128xui32>) -> tensor<1x256128xbf16> +func.func @bitcast_convert_ui32_f32(%arg0: tensor<1x256128xui32>) -> tensor<1x256128xf32> { + // CHECK: %[[BITCAST:.*]] = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xui32>) -> tensor<1x256128xf32> + // CHECK: %[[CONVERT:.*]] = stablehlo.convert %[[BITCAST]] : (tensor<1x256128xf32>) -> tensor<1x256128xbf16> + // CHECK: return %[[CONVERT]] : tensor<1x256128xbf16> + %20 = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xui32>) -> tensor<1x256128xf32> + return %20 : tensor<1x256128xf32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_f32_ui32(%arg0: tensor<1x256128xbf16>) -> tensor<1x256128xui32> +func.func @bitcast_convert_f32_ui32(%arg0: tensor<1x256128xf32>) -> tensor<1x256128xui32> { + // CHECK: %[[CONVERT:.*]] = stablehlo.convert %arg0 : (tensor<1x256128xbf16>) -> tensor<1x256128xf32> + // CHECK: %[[BITCAST:.*]] = stablehlo.bitcast_convert %[[CONVERT]] : (tensor<1x256128xf32>) -> tensor<1x256128xui32> + // CHECK: return %[[BITCAST]] : tensor<1x256128xui32> + %20 = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xf32>) -> 
tensor<1x256128xui32> + return %20 : tensor<1x256128xui32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_f32_f32(%arg0: tensor<1x256128xbf16>) -> tensor<1x256128xbf16> +func.func @bitcast_convert_f32_f32(%arg0: tensor<1x256128xf32>) -> tensor<1x256128xf32> { + // Convert bitcast_convert to no-op for f32->f32. + // CHECK: return %arg0 : tensor<1x256128xbf16> + %20 = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xf32>) -> tensor<1x256128xf32> + return %20 : tensor<1x256128xf32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_i32_ui32(%arg0: tensor<1x256128xi32>) -> tensor<1x256128xui32> +func.func @bitcast_convert_i32_ui32(%arg0: tensor<1x256128xi32>) -> tensor<1x256128xui32> { + // Do not convert bitcast_convert for legal types. + // CHECK: %[[BITCAST:.*]] = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xi32>) -> tensor<1x256128xui32> + // CHECK: return %[[BITCAST]] : tensor<1x256128xui32> + %20 = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xi32>) -> tensor<1x256128xui32> + return %20 : tensor<1x256128xui32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_bf16_bf16(%arg0: tensor<1x256128xbf16>) -> tensor<1x256128xbf16> +func.func @bitcast_convert_bf16_bf16(%arg0: tensor<1x256128xbf16>) -> tensor<1x256128xbf16> { + // Do not convert bitcast_convert for legal types. + // CHECK: %[[BITCAST:.*]] = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xbf16>) -> tensor<1x256128xbf16> + // CHECK: return %[[BITCAST]] : tensor<1x256128xbf16> + %20 = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xbf16>) -> tensor<1x256128xbf16> + return %20 : tensor<1x256128xbf16> +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_convert_xla_call_module_op_to_bfloat16.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_convert_xla_call_module_op_to_bfloat16.mlir new file mode 100644 index 000000000000..d3694e7e6402 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_convert_xla_call_module_op_to_bfloat16.mlir @@ -0,0 +1,42 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-xla-call-module-serialization -tf-stablehlo-convert-xla-call-module-op-to-bfloat16 -tf-xla-call-module-deserialization | FileCheck %s + +// ConvertXlaCallModuleOpToBfloat16Pass works on XlaCallModuleOps with +// serialized modules, which makes verification difficult. Therefore we add +// (de)serialization passes so that the input and output are deserialized +// StableHLO functions.
+ +// CHECK-LABEL: module +module { + // CHECK-LABEL: func @main + // CHECK-SAME: %[[ARG_0:.*]]: tensor<10xf32>, %[[ARG_1:.*]]: tensor<10xf32>, %[[ARG_2:.*]]: tensor<6xi32> + func.func @main( + %arg0: tensor<10xf32>, %arg1: tensor<10xf32>, %arg2: tensor<6xi32> + ) -> (tensor<10xf32>, tensor<6xi32>) { + // CHECK: %[[CAST_0:.*]] = "tf.Cast"(%[[ARG_0]]) <{Truncate = false}> : (tensor<10xf32>) -> tensor<10xbf16> + // CHECK: %[[CAST_1:.*]] = "tf.Cast"(%[[ARG_1]]) <{Truncate = false}> : (tensor<10xf32>) -> tensor<10xbf16> + // CHECK: %[[RESULT:.*]]:2 = "tf.XlaCallModule"(%[[CAST_0]], %[[CAST_1]], %[[ARG_2]]) + // CHECK-SAME: _stablehlo_version = "1.0.0" + // CHECK-SAME: (tensor<10xbf16>, tensor<10xbf16>, tensor<6xi32>) -> (tensor<10xbf16>, tensor<6xi32>) + // CHECK: %[[RESULT_CAST:.*]] = "tf.Cast"(%[[RESULT]]#0) <{Truncate = false}> : (tensor<10xbf16>) -> tensor<10xf32> + %0:2 = "tf.XlaCallModule"(%arg0, %arg1, %arg2) { + Sout = [#tf_type.shape<10>], dim_args_spec = [], + _entry_function = @main_0, + _stablehlo_version = "1.0.0", + _stablehlo_module_attrs = { mhlo.num_partitions = 1 }, module = "", + platforms = [], version = 5 : i64 + } : (tensor<10xf32>, tensor<10xf32>, tensor<6xi32>) -> (tensor<10xf32>, tensor<6xi32>) + // CHECK: return %[[RESULT_CAST]], %[[RESULT]]#1 : tensor<10xf32>, tensor<6xi32> + func.return %0#0, %0#1 : tensor<10xf32>, tensor<6xi32> + } + + // CHECK-LABEL: func private @main_0 + // CHECK-SAME: %[[ARG_0:.*]]: tensor<10xbf16>, %[[ARG_1:.*]]: tensor<10xbf16>, %[[ARG_2:.*]]: tensor<6xi32> + func.func private @main_0( + %arg0: tensor<10xf32>, %arg1: tensor<10xf32>, %arg2: tensor<6xi32> + ) -> (tensor<10xf32>, tensor<6xi32>) attributes {_from_xla_call_module} { + // CHECK: %[[ADD:.*]] = stablehlo.add %[[ARG_0]], %[[ARG_1]] : tensor<10xbf16> + %0 = stablehlo.add %arg0, %arg1 : tensor<10xf32> + // CHECK: return %[[ADD]], %[[ARG_2]] : tensor<10xbf16>, tensor<6xi32> + return %0, %arg2 : tensor<10xf32>, tensor<6xi32> + } +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_defer_activation_transpose.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_defer_activation_transpose.mlir new file mode 100644 index 000000000000..b4216725020c --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_defer_activation_transpose.mlir @@ -0,0 +1,307 @@ +// RUN: stablehlo-quant-opt %s -tf-stablehlo-defer-activation-transpose \ +// RUN: -split-input-file -verify-diagnostics | FileCheck %s + +// Tests that an `add(transpose(arg0), arg1)` pattern is converted to +// `transpose(add(arg0, transpose(arg1)))`. The transpose in the activation is +// deferred to the output of `stablehlo.add` and an extra transpose op is +// inserted to the RHS to match the shape of the operand. + +// CHECK-LABEL: add_with_activation_transpose +func.func @add_with_activation_transpose(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<1x4x3x3xf32> + %1 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32> + %2 = stablehlo.add %1, %0 : tensor<1x4x3x3xf32> + return %2 : tensor<1x4x3x3xf32> +} +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32> +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant +// CHECK-DAG: %[[TRANSPOSE_0:.+]] = stablehlo.transpose %[[CONST_0]], dims = [0, 2, 3, 1] : (tensor<1x4x3x3xf32>) -> tensor<1x3x3x4xf32> + +// Check that the shape of the add is changed to reflect the deferred transpose. 
+// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[ARG_0]], %[[TRANSPOSE_0]] : tensor<1x3x3x4xf32> +// CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose +// CHECK: return %[[TRANSPOSE_1]] + +// ----- + +// Tests that an `add(transpose(arg0), broadcast_in_dim(arg1))` pattern is +// converted to `transpose(add(arg0, transpose(broadcast_in_dim(arg1))))`. +// The transpose in the activation is deferred to the output of `stablehlo.add` +// and an extra transpose op is inserted to the RHS to match the shape of the +// operand. + +// CHECK-LABEL: add_with_activation_transpose_broadcasted_rhs +func.func @add_with_activation_transpose_broadcasted_rhs(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<4xf32> + %1 = stablehlo.broadcast_in_dim %0, dims = [1] : (tensor<4xf32>) -> tensor<1x4x3x3xf32> + %2 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32> + %3 = stablehlo.add %2, %1 : tensor<1x4x3x3xf32> + return %3 : tensor<1x4x3x3xf32> +} +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32> +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant +// CHECK-DAG: %[[BROADCAST:.+]] = stablehlo.broadcast_in_dim %[[CONST_0]], dims = [1] : (tensor<4xf32>) -> tensor<1x4x3x3xf32> +// CHECK-DAG: %[[TRANSPOSE_0:.+]] = stablehlo.transpose %[[BROADCAST]], dims = [0, 2, 3, 1] : (tensor<1x4x3x3xf32>) -> tensor<1x3x3x4xf32> + +// Check that the shape of the add is changed to reflect the deferred transpose. +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[ARG_0]], %[[TRANSPOSE_0]] : tensor<1x3x3x4xf32> +// CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose +// CHECK: return %[[TRANSPOSE_1]] + +// ----- + +// [No change] Tests that the activation transpose whose permutation is not +// `[0, 3, 1, 2]` is not deferred. + +// CHECK-LABEL: add_with_activation_transpose_permutation_mismatch +func.func @add_with_activation_transpose_permutation_mismatch( + %arg0: tensor<1x2x3x4xf32>) -> tensor<1x3x2x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<1x3x2x4xf32> + %1 = stablehlo.transpose %arg0, dims = [0, 2, 1, 3] : (tensor<1x2x3x4xf32>) -> tensor<1x3x2x4xf32> + %2 = stablehlo.add %1, %0 : tensor<1x3x2x4xf32> + return %2 : tensor<1x3x2x4xf32> +} +// CHECK: %[[TRANSPOSE_0:.+]] = stablehlo.transpose +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[TRANSPOSE_0]], {{.*}} +// CHECK: return %[[ADD_0]] + +// ----- + +// [No change] Tests that the activation transpose whose rank is not 4 is not +// deferred. + +// CHECK-LABEL: add_with_activation_transpose_rank_two +func.func @add_with_activation_transpose_rank_two(%arg0: tensor<1x2xf32>) -> tensor<2x1xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<2x1xf32> + %1 = stablehlo.transpose %arg0, dims = [1, 0] : (tensor<1x2xf32>) -> tensor<2x1xf32> + %2 = stablehlo.add %1, %0 : tensor<2x1xf32> + return %2 : tensor<2x1xf32> +} +// CHECK: %[[TRANSPOSE_0:.+]] = stablehlo.transpose +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[TRANSPOSE_0]], {{.*}} +// CHECK: return %[[ADD_0]] + +// ----- + +// [No change] Tests that the right-hand side that is not a constant is not +// deferred. 
+ +// CHECK-LABEL: add_with_activation_transpose_nonconst_rhs +func.func @add_with_activation_transpose_nonconst_rhs(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<1x4x3x3xf32>) -> tensor<1x4x3x3xf32> { + %0 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32> + %1 = stablehlo.add %0, %arg1 : tensor<1x4x3x3xf32> + return %1 : tensor<1x4x3x3xf32> +} +// CHECK: %[[TRANSPOSE_0:.+]] = stablehlo.transpose +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[TRANSPOSE_0]], {{.*}} +// CHECK: return %[[ADD_0]] + +// ----- + +// Tests that the transpose of the input of `stablehlo.reduce_window` is +// deferred to the result. The attributes are permutated according to the new +// input shape. + +// CHECK-LABEL: reduce_window_max_activation_transpose +func.func @reduce_window_max_activation_transpose(%arg0: tensor<1x16x16x4xf32>) -> tensor<1x4x8x8xf32> { + %0 = stablehlo.constant dense<0xFF800000> : tensor // -inf + %1 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x16x16x4xf32>) -> tensor<1x4x16x16xf32> + %2 = "stablehlo.reduce_window"(%1, %0) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %3 = stablehlo.maximum %arg1, %arg2 : tensor + stablehlo.return %3 : tensor + }) {window_dimensions = array, window_strides = array} : (tensor<1x4x16x16xf32>, tensor) -> tensor<1x4x8x8xf32> + return %2 : tensor<1x4x8x8xf32> +} +// CHECK-SAME: %[[ARG:.+]]: tensor<1x16x16x4xf32> +// CHECK-DAG: %[[INIT_VALUE_CONST:.+]] = stablehlo.constant dense<0xFF800000> + +// Check that the body is not modified. +// CHECK: %[[REDUCE_WINDOW:.+]] = "stablehlo.reduce_window"(%[[ARG]], %[[INIT_VALUE_CONST]]) +// CHECK: <{window_dimensions = array, window_strides = array}> +// CHECK: ^bb0(%[[REDUCE_ARG_0:.+]]: tensor, %[[REDUCE_ARG_1:.+]]: tensor): +// CHECK: %[[MAX:.+]] = stablehlo.maximum %[[REDUCE_ARG_0]], %[[REDUCE_ARG_1]] +// CHECK: stablehlo.return %[[MAX]] + +// Check that the attributes window_dimensions & window_strides are also +// permutated to match the new input shape. +// CHECK: (tensor<1x16x16x4xf32>, tensor) -> tensor<1x8x8x4xf32> + +// Check that a `stablehlo.transpose` is added to the result to match the shape +// of the users. +// CHECK: %[[TRANSPOSE:.+]] = stablehlo.transpose %[[REDUCE_WINDOW]], dims = [0, 3, 1, 2] : (tensor<1x8x8x4xf32>) -> tensor<1x4x8x8xf32> +// CHECK: return %[[TRANSPOSE]] + +// ----- + +// Tests that the transpose of the input of `stablehlo.reduce_window` is +// deferred to the result. The attributes are permutated according to the new +// input shape. This test is similar to the test above with the difference that +// the `stablehlo.reduce_window` has explicit optional attributes: +// `base_dilations` and `window_dilations`. 
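A rough sketch of the attribute re-layout described above (the window values here are illustrative, chosen to be consistent with the 16x16 -> 8x8 max pooling in the test; this is not the pass implementation). When the transpose is deferred past `stablehlo.reduce_window`, each attribute entry written for an NCHW input axis moves to the NHWC axis that the transpose had mapped it from.

perm = [0, 3, 1, 2]              # NHWC -> NCHW transpose applied to the activation
window_dimensions_nchw = [1, 1, 2, 2]   # assumed values for the pooling above
window_strides_nchw = [1, 1, 2, 2]

def permute_attr(attr_nchw, perm):
    # NCHW axis i corresponds to NHWC axis perm[i], so its attribute entry
    # moves to position perm[i] once the op consumes the NHWC tensor directly.
    attr_nhwc = [0] * len(attr_nchw)
    for nchw_axis, nhwc_axis in enumerate(perm):
        attr_nhwc[nhwc_axis] = attr_nchw[nchw_axis]
    return attr_nhwc

print(permute_attr(window_dimensions_nchw, perm))  # [1, 2, 2, 1]
print(permute_attr(window_strides_nchw, perm))     # [1, 2, 2, 1]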
+ +// CHECK-LABEL: reduce_window_max_activation_transpose_explicit_optional_attrs +func.func @reduce_window_max_activation_transpose_explicit_optional_attrs( + %arg0: tensor<1x16x16x4xf32>) -> tensor<1x4x15x15xf32> { + %0 = stablehlo.constant dense<0xFF800000> : tensor // -inf + %1 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x16x16x4xf32>) -> tensor<1x4x16x16xf32> + %2 = "stablehlo.reduce_window"(%1, %0) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %3 = stablehlo.maximum %arg1, %arg2 : tensor + stablehlo.return %3 : tensor + }) { + window_dimensions = array, + window_strides = array, + base_dilations = array, + window_dilations = array + } : (tensor<1x4x16x16xf32>, tensor) -> tensor<1x4x15x15xf32> + return %2 : tensor<1x4x15x15xf32> +} +// CHECK-SAME: %[[ARG:.+]]: tensor<1x16x16x4xf32> +// CHECK-DAG: %[[INIT_VALUE_CONST:.+]] = stablehlo.constant dense<0xFF800000> + +// Check that the body is not modified. +// CHECK: %[[REDUCE_WINDOW:.+]] = "stablehlo.reduce_window"(%[[ARG]], %[[INIT_VALUE_CONST]]) +// CHECK: <{base_dilations = array, window_dilations = array, window_dimensions = array, window_strides = array}> +// CHECK: ^bb0(%[[REDUCE_ARG_0:.+]]: tensor, %[[REDUCE_ARG_1:.+]]: tensor): +// CHECK: %[[MAX:.+]] = stablehlo.maximum %[[REDUCE_ARG_0]], %[[REDUCE_ARG_1]] +// CHECK: stablehlo.return %[[MAX]] + +// Check that the attributes window_dimensions & window_strides along with +// optional attributes base_dilations and window_dilations are also permutated +// to match the new input shape. +// CHECK: (tensor<1x16x16x4xf32>, tensor) -> tensor<1x15x15x4xf32> + +// Check that a `stablehlo.transpose` is added to the result to match the shape +// of the users. +// CHECK: %[[TRANSPOSE:.+]] = stablehlo.transpose %[[REDUCE_WINDOW]], dims = [0, 3, 1, 2] : (tensor<1x15x15x4xf32>) -> tensor<1x4x15x15xf32> +// CHECK: return %[[TRANSPOSE]] + +// ----- + +// [No change] Tests that the transpose of the input of +// `stablehlo.reduce_window` is NOT deferred to the result, when the input +// tensor does not have rank 4. + +// CHECK-LABEL: reduce_window_max_activation_transpose +// CHECK-SAME: (%[[ARG:.+]]: tensor<16x8xf32>) -> tensor<4x8xf32> +func.func @reduce_window_max_activation_transpose_rank2(%arg0: tensor<16x8xf32>) -> tensor<4x8xf32> { + %0 = stablehlo.constant dense<0xFF800000> : tensor // -inf + %1 = stablehlo.transpose %arg0, dims = [1, 0] : (tensor<16x8xf32>) -> tensor<8x16xf32> + %2 = "stablehlo.reduce_window"(%1, %0) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %3 = stablehlo.maximum %arg1, %arg2 : tensor + stablehlo.return %3 : tensor + }) {window_dimensions = array, window_strides = array} : (tensor<8x16xf32>, tensor) -> tensor<4x8xf32> + return %2 : tensor<4x8xf32> +} +// CHECK-DAG: stablehlo.constant +// CHECK: stablehlo.transpose %[[ARG]] +// CHECK: stablehlo.reduce_window + +// ----- + +// [No change] Tests that the transpose of the input of +// `stablehlo.reduce_window` is NOT deferred to the result, when it has an +// explicit `padding` attribute. 
+ +// CHECK-LABEL: reduce_window_max_activation_transpose_with_padding +func.func @reduce_window_max_activation_transpose_with_padding(%arg0: tensor<1x16x16x4xf32>) -> tensor<1x4x9x9xf32> { + %0 = stablehlo.constant dense<0xFF800000> : tensor // -inf + %1 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x16x16x4xf32>) -> tensor<1x4x16x16xf32> + %2 = "stablehlo.reduce_window"(%1, %0) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %3 = stablehlo.maximum %arg1, %arg2 : tensor + stablehlo.return %3 : tensor + }) { + window_dimensions = array, + window_strides = array, + padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64> + } : (tensor<1x4x16x16xf32>, tensor) -> tensor<1x4x9x9xf32> + return %2 : tensor<1x4x9x9xf32> +} +// CHECK-SAME: %[[ARG:.+]]: tensor<1x16x16x4xf32> +// CHECK-DAG: stablehlo.constant +// CHECK: stablehlo.transpose %[[ARG]] +// CHECK: stablehlo.reduce_window + +// ----- + +// [No change] Tests that the transpose of the input of +// `stablehlo.reduce_window` is NOT deferred to the result, when the transpose +// isn't `[0, 3, 1, 2]` (i.e. NCHW->NHWC). + +// CHECK-LABEL: reduce_window_max_activation_transpose_with_padding +func.func @reduce_window_max_activation_transpose_with_padding(%arg0: tensor<16x16x4x1xf32>) -> tensor<1x4x8x8xf32> { + %0 = stablehlo.constant dense<0xFF800000> : tensor // -inf + %1 = stablehlo.transpose %arg0, dims = [3, 2, 1, 0] : (tensor<16x16x4x1xf32>) -> tensor<1x4x16x16xf32> + %2 = "stablehlo.reduce_window"(%1, %0) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %3 = stablehlo.maximum %arg1, %arg2 : tensor + stablehlo.return %3 : tensor + }) { + window_dimensions = array, + window_strides = array + } : (tensor<1x4x16x16xf32>, tensor) -> tensor<1x4x8x8xf32> + return %2 : tensor<1x4x8x8xf32> +} +// CHECK-SAME: %[[ARG:.+]]: tensor<16x16x4x1xf32> +// CHECK-DAG: stablehlo.constant +// CHECK: stablehlo.transpose %[[ARG]] +// CHECK: stablehlo.reduce_window + +// ----- + +// Tests that an `max(transpose(arg0), arg1)` pattern is converted to +// `transpose(max(arg0, transpose(arg1)))`. The transpose in the activation is +// deferred to the output of `stablehlo.max` and an extra transpose op is +// inserted to the RHS to match the shape of the operand. + +// CHECK-LABEL: max_with_activation_transpose +func.func @max_with_activation_transpose(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<1x4x3x3xf32> + %1 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32> + %2 = stablehlo.maximum %1, %0 : tensor<1x4x3x3xf32> + return %2 : tensor<1x4x3x3xf32> +} +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32> +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant +// CHECK-DAG: %[[TRANSPOSE_0:.+]] = stablehlo.transpose %[[CONST_0]], dims = [0, 2, 3, 1] : (tensor<1x4x3x3xf32>) -> tensor<1x3x3x4xf32> + +// Check that the shape of the add is changed to reflect the deferred transpose. +// CHECK: %[[MAX_0:.+]] = stablehlo.maximum %[[ARG_0]], %[[TRANSPOSE_0]] : tensor<1x3x3x4xf32> +// CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose +// CHECK: return %[[TRANSPOSE_1]] + +// ----- + +// [No change] Tests that the activation transpose of `stablehlo.maximum` whose +// permutation is not `[0, 3, 1, 2]` is not deferred. 
+ +// CHECK-LABEL: max_with_activation_transpose_permutation_mismatch +func.func @max_with_activation_transpose_permutation_mismatch( + %arg0: tensor<1x2x3x4xf32>) -> tensor<1x3x2x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<1x3x2x4xf32> + %1 = stablehlo.transpose %arg0, dims = [0, 2, 1, 3] : (tensor<1x2x3x4xf32>) -> tensor<1x3x2x4xf32> + %2 = stablehlo.maximum %1, %0 : tensor<1x3x2x4xf32> + return %2 : tensor<1x3x2x4xf32> +} +// CHECK: %[[TRANSPOSE_0:.+]] = stablehlo.transpose +// CHECK: %[[MAX_0:.+]] = stablehlo.maximum %[[TRANSPOSE_0]], {{.*}} +// CHECK: return %[[MAX_0]] + +// ----- + +// [No change] Tests that the activation transpose of `stablehlo.maximum` whose +// rank is not 4 is not deferred. + +// CHECK-LABEL: max_with_activation_transpose_rank_two +func.func @max_with_activation_transpose_rank_two(%arg0: tensor<1x2xf32>) -> tensor<2x1xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<2x1xf32> + %1 = stablehlo.transpose %arg0, dims = [1, 0] : (tensor<1x2xf32>) -> tensor<2x1xf32> + %2 = stablehlo.maximum %1, %0 : tensor<2x1xf32> + return %2 : tensor<2x1xf32> +} +// CHECK: %[[TRANSPOSE_0:.+]] = stablehlo.transpose +// CHECK: %[[MAX_0:.+]] = stablehlo.maximum %[[TRANSPOSE_0]], {{.*}} +// CHECK: return %[[MAX_0]] diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_fold_constant_transpose.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_fold_constant_transpose.mlir new file mode 100644 index 000000000000..da96bb0e7a68 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_fold_constant_transpose.mlir @@ -0,0 +1,59 @@ +// RUN: stablehlo-quant-opt %s -tf-stablehlo-fold-constant-transpose \ +// RUN: -split-input-file | FileCheck %s + +// CHECK-LABEL: transpose_simple_1d +func.func @transpose_simple_1d() -> tensor<2xf32> { + %0 = stablehlo.constant dense<[0.000000e+0, 1.000000e+0]> : tensor<2xf32> + %1 = stablehlo.transpose %0, dims = [0] : (tensor<2xf32>) -> tensor<2xf32> + return %1 : tensor<2xf32> +} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant dense<[0.000000e+00, 1.000000e+00]> : tensor<2xf32> +// CHECK-NOT: transpose +// CHECK: return %[[CONST_0]] : tensor<2xf32> + +// ----- + +// CHECK-LABEL: transpose_simple_2d +func.func @transpose_simple_2d() -> tensor<3x2xf32> { + %0 = stablehlo.constant dense<[[0.000000e+0, 1.000000e+0, 2.000000e+0], [3.000000e+0, 4.000000e+0, 5.000000e+0]]> : tensor<2x3xf32> + %1 = stablehlo.transpose %0, dims = [1, 0] : (tensor<2x3xf32>) -> tensor<3x2xf32> + return %1 : tensor<3x2xf32> +} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant dense<{{\[\[}}0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf32> +// CHECK-NOT: transpose +// CHECK: return %[[CONST_0]] : tensor<3x2xf32> + +// ----- + +// CHECK-LABEL: transpose_simple_4d +func.func @transpose_simple_4d() -> tensor<5x2x3x4xf32> { + %0 = stablehlo.constant dense<1.000000e+0> : tensor<2x3x4x5xf32> + %1 = stablehlo.transpose %0, dims = [3, 0, 1, 2] : (tensor<2x3x4x5xf32>) -> tensor<5x2x3x4xf32> + return %1 : tensor<5x2x3x4xf32> +} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant dense<1.000000e+00> : tensor<5x2x3x4xf32> +// CHECK-NOT: transpose +// CHECK: return %[[CONST_0]] : tensor<5x2x3x4xf32> + +// ----- + +// Tests that int constants are not folded. 
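For the float cases above, the fold simply re-lays-out the constant data at compile time; the following NumPy sketch reproduces the expected 3x2 constant from `transpose_simple_2d` (illustration only, not the folding code). The integer case below is deliberately left unfolded by the pass.

import numpy as np

const_2x3 = np.array([[0.0, 1.0, 2.0],
                      [3.0, 4.0, 5.0]], dtype=np.float32)

folded = np.transpose(const_2x3, (1, 0))
# Matches the CHECK'd constant: [[0, 3], [1, 4], [2, 5]] with shape (3, 2).
print(folded)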
+ +// CHECK-LABEL: transpose_int +func.func @transpose_int() -> tensor<3x2xi32> { + %0 = stablehlo.constant dense<0> : tensor<2x3xi32> + %1 = stablehlo.transpose %0, dims = [1, 0] : (tensor<2x3xi32>) -> tensor<3x2xi32> + return %1 : tensor<3x2xi32> +} +// CHECK: transpose + +// ----- + +// Tests that transposing an argument cannot be folded. + +// CHECK-LABEL: transpose_arg +func.func @transpose_arg(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { + %0 = stablehlo.transpose %arg0, dims = [1, 0] : (tensor<2x3xf32>) -> tensor<3x2xf32> + return %0 : tensor<3x2xf32> +} +// CHECK: transpose diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_insert_calibration_statistics_saver.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_insert_calibration_statistics_saver.mlir new file mode 100644 index 000000000000..8e034735ee9a --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_insert_calibration_statistics_saver.mlir @@ -0,0 +1,219 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -mlir-disable-threading -tf-stablehlo-insert-calibration-statistics-saver | FileCheck %s + +func.func @serving_default(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x2x2x2xf32>) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}} { + %cst = "tf.Const"() <{value = dense<[[[[-0.891899645, 0.392044574], [0.77720493, 1.31188095], [0.255048186, 2.700150e+00]], [[-1.08111858, -0.406604826], [-0.298575521, -2.25356531], [-1.00201964, 2.54532099]], [[-1.34911358, 0.279911458], [-0.868258893, -1.36708188], [0.866317451, -2.05804896]]], [[[-0.591397941, 0.331505477], [0.715151429, 2.64073896], [1.27163255, 0.206143498]], [[0.474211812, 1.45044816], [0.119936548, 2.54149938], [-0.939900994, 0.438387245]], [[-1.12486279, -1.09022558], [0.82202208, 1.04652023], [1.30316162, 2.62054276]]]]> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 5 : i32, id = "0", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x3x4x3xf32>) -> (tensor<1x3x4x3xf32>, tensor, tensor, tensor<512xi64>) + %0 = "tf.Conv2D"(%output, %cst) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> + %output_1, %min_2, %max_3, %histogram_4 = "tf.CustomAggregator"(%0) <{calibration_method = 5 : i32, id = "1", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x2x2x2xf32>) -> (tensor<1x2x2x2xf32>, tensor, tensor, tensor<512xi64>) + %1 = "tf.Identity"(%output_1) {device = ""} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> + return %1 : tensor<1x2x2x2xf32> +} +// CHECK-LABEL: @serving_default +// CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], %[[MIN_O:.*]], %[[MAX_O:.*]], %[[HISTOGRAM_0:.*]] = "tf.CustomAggregator" +// CKECK-SAME: <{calibration_method = 5 : i32, id = "0", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x3x4x3xf32>) -> (tensor<1x3x4x3xf32>, tensor, tensor, tensor<512xi64>) +// CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], %[[MIN_1:.*]], %[[MAX_1:.*]], %[[HISTOGRAM_1:.*]] = "tf.CustomAggregator" +// CKECK-SAME: 
<{calibration_method = 5 : i32, id = "1", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x3x4x3xf32>) -> (tensor<1x3x4x3xf32>, tensor, tensor, tensor<512xi64>) +// CHECK: "tf.CalibrationStatisticsSaver"(%[[MIN_O]], %[[MAX_O]], %[[HISTOGRAM_0]], %[[MIN_1]], %[[MAX_1]], %[[HISTOGRAM_1]]) +// CHECK-SAME: <{calibration_methods = [5 : i32, 5 : i32], ids = ["0", "1"], output_file_path = "serving_default_0.pb"}> : (tensor, tensor, tensor<512xi64>, tensor, tensor, tensor<512xi64>) -> () +// CHECK: return + +// ----- + +// No CustomAggregator ops exist. +func.func private @composite_conv2d_with_bias_and_relu6_fn_1(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<2xf32>) -> tensor<1x2x2x2xf32> attributes {tf_quant.composite_function} { + %0 = "tf.Conv2D"(%arg0, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> : (tensor<1x2x2x2xf32>, tensor<2xf32>) -> tensor<1x2x2x2xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> + return %2 : tensor<1x2x2x2xf32> +} +// CHECK-LABEL: @composite_conv2d_with_bias_and_relu6_fn_1 +// CHECK-NOT: "tf.CalibrationStatisticsSaver" + +// ----- + +// Check the IfOp is set to stateful. +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1833 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: func.func @serving_default + // CHECK: "tf.If" + // CHECK-SAME: is_stateless = false + func.func @serving_default(%arg0: tensor<1x4xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi32>}> {device = ""} : () -> tensor<2xi32> + %cst_0 = "tf.Const"() <{value = dense<1.000000e+01> : tensor}> {device = ""} : () -> tensor + %0 = "tf.Sum"(%arg0, %cst) <{keep_dims = false}> {device = ""} : (tensor<1x4xf32>, tensor<2xi32>) -> tensor + %1 = "tf.Greater"(%0, %cst_0) {device = ""} : (tensor, tensor) -> tensor + %2:2 = "tf.If"(%1, %arg0) <{else_branch = @cond_false_80, is_stateless = true, then_branch = @cond_true_70}> {Tcond = i1, Tin = [f32], Tout = [i1, f32], _lower_using_switch_merge = true, _read_only_resource_inputs = [], device = ""} : (tensor, tensor<1x4xf32>) -> (tensor, tensor<1x3xf32>) + %3 = "tf.Identity"(%2#1) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @cond_false_80 + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "cond_false_80_0.pb" + func.func private @cond_false_80(%arg0: tensor<1x4xf32> {tf._user_specified_name = "x"}) -> (tensor, tensor<1x3xf32>) attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<1x4>], tf._original_func_name = "cond_false_8"} { + %cst = "tf.Const"() <{value = dense : tensor}> {device = ""} : () -> tensor + %cst_0 = "tf.Const"() <{value = dense<[0.117216609, 0.933735609, 0.0728900209]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() 
<{value = dense<[[-0.795477629, 0.581315517, 0.921566545], [0.138622552, 0.463866323, 0.95474267], [-0.143770888, -0.796835303, 0.899996876], [0.0989735424, -0.483384758, -7.277030e-01]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) + %0 = "tf.Identity"(%cst) {device = ""} : (tensor) -> tensor + %1 = "tf.PartitionedCall"(%output, %cst_1, %cst_0) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %output_2, %min_3, %max_4, %histogram_5 = "tf.CustomAggregator"(%1) <{calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) + %2 = "tf.Identity"(%output_2) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %0, %2 : tensor, tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @cond_true_70 + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "cond_true_70_0.pb" + func.func private @cond_true_70(%arg0: tensor<1x4xf32> {tf._user_specified_name = "x"}) -> (tensor, tensor<1x3xf32>) attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<1x4>], tf._original_func_name = "cond_true_7"} { + %cst = "tf.Const"() <{value = dense : tensor}> {device = ""} : () -> tensor + %cst_0 = "tf.Const"() <{value = dense<[0.335351914, 0.084816426, -0.664676845]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() <{value = dense<[[-0.630731344, 0.54962182, 0.180364341], [-0.764542698, -0.211145893, -0.708605706], [-0.954062759, -0.614013135, 0.612640202], [-0.418223292, 5.057390e-01, 0.899269938]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "2", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) + %0 = "tf.Identity"(%cst) {device = ""} : (tensor) -> tensor + %1 = "tf.PartitionedCall"(%output, %cst_1, %cst_0) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %output_2, %min_3, %max_4, %histogram_5 = "tf.CustomAggregator"(%1) <{calibration_method = 1 : i32, id = "3", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) + %2 = "tf.Identity"(%output_2) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %0, %2 : tensor, tensor<1x3xf32> + } + + func.func private @composite_matmul_with_bias_fn_1(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> 
tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + + func.func private @composite_matmul_with_bias_fn_2(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} + +// ----- + +// Check the IfRegion is set to stateful. +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1833 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: func.func @serving_default + // CHECK: "tf.IfRegion" + // CHECK-SAME: is_stateless = false + + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "serving_default_0.pb" + + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "serving_default_1.pb" + + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "serving_default_2.pb" + func.func @serving_default(%arg0: tensor<1x4xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() <{value = dense<1.000000e+01> : tensor}> {device = ""} : () -> tensor + %cst_0 = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi32>}> {device = ""} : () -> tensor<2xi32> + %cst_1 = "tf.Const"() <{value = dense<[[-0.630731344, 0.54962182, 0.180364341], [-0.764542698, -0.211145893, -0.708605706], [-0.954062759, -0.614013135, 0.612640202], [-0.418223292, 5.057390e-01, 0.899269938]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %cst_2 = "tf.Const"() <{value = dense<[0.335351914, 0.084816426, -0.664676845]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %cst_3 = "tf.Const"() <{value = dense : tensor}> {device = ""} : () -> tensor + %cst_4 = "tf.Const"() <{value = dense<[[-0.795477629, 0.581315517, 0.921566545], [0.138622552, 0.463866323, 0.95474267], [-0.143770888, -0.796835303, 0.899996876], [0.0989735424, -0.483384758, -7.277030e-01]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %cst_5 = "tf.Const"() <{value = dense<[0.117216609, 0.933735609, 0.0728900209]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) + %0 = "tf.Sum"(%output, %cst_0) <{keep_dims = false}> {device = ""} : (tensor<1x4xf32>, tensor<2xi32>) -> tensor + %1 = "tf.Greater"(%0, %cst) {device = ""} : (tensor, tensor) -> tensor + %2:2 = "tf.IfRegion"(%1) <{_else_func_name = "cond_false_80", _then_func_name = "cond_true_70", is_stateless = true}> ({ + %4 = "tf.Identity"(%cst_3) {device = ""} : (tensor) -> tensor + %5 = "tf.PartitionedCall"(%output, %cst_1, %cst_2) <{config = "", config_proto = "", executor_type = "", f = 
@composite_matmul_with_bias_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %output_6, %min_7, %max_8, %histogram_9 = "tf.CustomAggregator"(%5) <{calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) + %6 = "tf.Identity"(%output_6) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + "tf.Yield"(%4, %6) {device = ""} : (tensor, tensor<1x3xf32>) -> () + }, { + %4 = "tf.Identity"(%cst_3) {device = ""} : (tensor) -> tensor + %5 = "tf.PartitionedCall"(%output, %cst_4, %cst_5) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %output_6, %min_7, %max_8, %histogram_9 = "tf.CustomAggregator"(%5) <{calibration_method = 1 : i32, id = "2", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) + %6 = "tf.Identity"(%output_6) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + "tf.Yield"(%4, %6) {device = ""} : (tensor, tensor<1x3xf32>) -> () + }) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = ""} : (tensor) -> (tensor, tensor<1x3xf32>) + %3 = "tf.Identity"(%2#1) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + func.func private @composite_matmul_with_bias_fn_2(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + func.func private @composite_matmul_with_bias_fn_1(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1836 : i32}, tf_saved_model.semantics} { + func.func @main(%arg0: tensor<10x1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<10x1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = stablehlo.constant dense<0.000000e+00>: tensor<10x1024x3xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<10x1x1024xf32>) -> (tensor<10x1x1024xf32>, tensor, tensor, 
tensor<0xi64>) + %0 = "tf.XlaCallModule"(%output, %cst) <{Sout = [#tf_type.shape<10x1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + %output_0, %min_1, %max_2, %histogram_3 = "tf.CustomAggregator"(%0) <{calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<10x1x3xf32>) -> (tensor<10x1x3xf32>, tensor, tensor, tensor<0xi64>) + return %output_0 : tensor<10x1x3xf32> + } + // CHECK-LABEL: @main + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], %[[MIN_O:.*]], %[[MAX_O:.*]], %[[HISTOGRAM_0:.*]] = "tf.CustomAggregator" + // CKECK-SAME: <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], %[[MIN_1:.*]], %[[MAX_1:.*]], %[[HISTOGRAM_1:.*]] = "tf.CustomAggregator" + // CKECK-SAME: <{calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: "tf.CalibrationStatisticsSaver"(%[[MIN_O]], %[[MAX_O]], %[[HISTOGRAM_0]], %[[MIN_1]], %[[MAX_1]], %[[HISTOGRAM_1]]) + // CHECK-SAME: <{calibration_methods = [1 : i32, 1 : i32], ids = ["0", "1"], output_file_path = "main_0.pb"}> : (tensor, tensor, tensor<0xi64>, tensor, tensor, tensor<0xi64>) -> () + // CHECK: return + + func.func private @composite_dot_general_with_relu_fn_1(%arg0: tensor<10x1x1024xf32>, %arg1: tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %cst = stablehlo.constant dense<0.000000e+00> : tensor<10x1x3xf32> + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0] x [0], contracting_dims = [2] x [1], precision = [DEFAULT, DEFAULT] {mhlo.frontend_attributes = {grad_x = "false", grad_y = "false"}} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + %1 = stablehlo.maximum %0, %cst : tensor<10x1x3xf32> + return %1 : tensor<10x1x3xf32> + } + // CHECK-LABEL: func.func private @composite_dot_general_with_relu_fn_1 + // CHECK-NOT: "tf.CalibrationStatisticsSaver" +} + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1836 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: func.func @main + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "main_0.pb" + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "main_1.pb" + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "main_2.pb" + func.func @main(%arg0: tensor<1x4xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = stablehlo.constant dense<1.000000e+01> : tensor + %cst_0 = stablehlo.constant dense<[[-0.630731344, 0.54962182, 0.180364341], [-0.764542698, -0.211145893, -0.708605706], [-0.954062759, -0.614013135, 
0.612640202], [-0.418223292, 5.057390e-01, 0.899269938]]> : tensor<4x3xf32> + %c = stablehlo.constant dense : tensor + %cst_1 = stablehlo.constant dense<[[-0.795477629, 0.581315517, 0.921566545], [0.138622552, 0.463866323, 0.95474267], [-0.143770888, -0.796835303, 0.899996876], [0.0989735424, -0.483384758, -7.277030e-01]]> : tensor<4x3xf32> + %cst_2 = stablehlo.constant dense<-0.000000e+00> : tensor + %cst_3 = stablehlo.constant dense<[[0.335351914, 0.084816426, -0.664676845]]> : tensor<1x3xf32> + %cst_4 = stablehlo.constant dense<[[0.117216609, 0.933735609, 0.0728900209]]> : tensor<1x3xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) + %0 = stablehlo.reduce(%output init: %cst_2) applies stablehlo.add across dimensions = [0, 1] : (tensor<1x4xf32>, tensor) -> tensor + %1 = stablehlo.compare GT, %0, %cst : (tensor, tensor) -> tensor + %2:2 = "stablehlo.if"(%1) ({ + %3 = "tf.XlaCallModule"(%output, %cst_0, %cst_3) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_same_shape_fn_2, _original_entry_function = "composite_dot_general_with_bias_same_shape_fn_2", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + %output_5, %min_6, %max_7, %histogram_8 = "tf.CustomAggregator"(%3) <{calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) + stablehlo.return %c, %output_5 : tensor, tensor<1x3xf32> + }, { + %3 = "tf.XlaCallModule"(%output, %cst_1, %cst_4) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_same_shape_fn_1, _original_entry_function = "composite_dot_general_with_bias_same_shape_fn_1", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + %output_5, %min_6, %max_7, %histogram_8 = "tf.CustomAggregator"(%3) <{calibration_method = 1 : i32, id = "2", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) + stablehlo.return %c, %output_5 : tensor, tensor<1x3xf32> + }) : (tensor) -> (tensor, tensor<1x3xf32>) + return %2#1 : tensor<1x3xf32> + } + func.func private @composite_dot_general_with_bias_same_shape_fn_2(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + + func.func 
private @composite_dot_general_with_bias_same_shape_fn_1(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_insert_calibration_statistics_saver_with_skipping.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_insert_calibration_statistics_saver_with_skipping.mlir new file mode 100644 index 000000000000..a7a4e6d7b47f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_insert_calibration_statistics_saver_with_skipping.mlir @@ -0,0 +1,47 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-insert-calibration-statistics-saver='aggregator-ops-to-ignore=skipping_id' | FileCheck %s + +func.func @serving_default(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x2x2x2xf32>) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}} { + %cst = "tf.Const"() <{value = dense<[[[[-0.891899645, 0.392044574], [0.77720493, 1.31188095], [0.255048186, 2.700150e+00]], [[-1.08111858, -0.406604826], [-0.298575521, -2.25356531], [-1.00201964, 2.54532099]], [[-1.34911358, 0.279911458], [-0.868258893, -1.36708188], [0.866317451, -2.05804896]]], [[[-0.591397941, 0.331505477], [0.715151429, 2.64073896], [1.27163255, 0.206143498]], [[0.474211812, 1.45044816], [0.119936548, 2.54149938], [-0.939900994, 0.438387245]], [[-1.12486279, -1.09022558], [0.82202208, 1.04652023], [1.30316162, 2.62054276]]]]> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 5 : i32, id = "skipping_id", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x3x4x3xf32>) -> (tensor<1x3x4x3xf32>, tensor, tensor, tensor<512xi64>) + %0 = "tf.Conv2D"(%output, %cst) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> + %output_1, %min_2, %max_3, %histogram_4 = "tf.CustomAggregator"(%0) <{calibration_method = 5 : i32, id = "keeping_id", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x2x2x2xf32>) -> (tensor<1x2x2x2xf32>, tensor, tensor, tensor<512xi64>) + %1 = "tf.Identity"(%output_1) {device = ""} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> + return %1 : tensor<1x2x2x2xf32> +} +// CHECK-LABEL: @serving_default +// CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], %[[MIN_O:.*]], %[[MAX_O:.*]], %[[HISTOGRAM_0:.*]] = "tf.CustomAggregator" +// CKECK-SAME: <{calibration_method = 5 : i32, id = "skipping_id", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x3x4x3xf32>) -> (tensor<1x3x4x3xf32>, tensor, tensor, tensor<512xi64>) +// CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], %[[MIN_1:.*]], %[[MAX_1:.*]], %[[HISTOGRAM_1:.*]] = "tf.CustomAggregator" +// CKECK-SAME: <{calibration_method = 5 : i32, id 
= "keeping_id", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x3x4x3xf32>) -> (tensor<1x3x4x3xf32>, tensor, tensor, tensor<512xi64>) +// CHECK: "tf.CalibrationStatisticsSaver"(%[[MIN_1]], %[[MAX_1]], %[[HISTOGRAM_1]]) +// CHECK-SAME: <{calibration_methods = [5 : i32], ids = ["keeping_id"], output_file_path = "serving_default_0.pb"}> : (tensor, tensor, tensor<512xi64>) -> () +// CHECK: return + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1836 : i32}, tf_saved_model.semantics} { + func.func @main(%arg0: tensor<10x1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<10x1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = stablehlo.constant dense<0.000000e+00>: tensor<10x1024x3xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "skipping_id", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<10x1x1024xf32>) -> (tensor<10x1x1024xf32>, tensor, tensor, tensor<0xi64>) + %0 = "tf.XlaCallModule"(%output, %cst) <{Sout = [#tf_type.shape<10x1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + %output_0, %min_1, %max_2, %histogram_3 = "tf.CustomAggregator"(%0) <{calibration_method = 1 : i32, id = "keeping_id", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<10x1x3xf32>) -> (tensor<10x1x3xf32>, tensor, tensor, tensor<0xi64>) + return %output_0 : tensor<10x1x3xf32> + } + // CHECK-LABEL: @main + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], %[[MIN_O:.*]], %[[MAX_O:.*]], %[[HISTOGRAM_0:.*]] = "tf.CustomAggregator" + // CKECK-SAME: <{calibration_method = 1 : i32, id = "skipping_id", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], %[[MIN_1:.*]], %[[MAX_1:.*]], %[[HISTOGRAM_1:.*]] = "tf.CustomAggregator" + // CKECK-SAME: <{calibration_method = 1 : i32, id = "keeping_id", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: "tf.CalibrationStatisticsSaver"(%[[MIN_1]], %[[MAX_1]], %[[HISTOGRAM_1]]) + // CHECK-SAME: <{calibration_methods = [1 : i32], ids = ["keeping_id"], output_file_path = "main_0.pb"}> : (tensor, tensor, tensor<0xi64>) -> () + // CHECK: return + + func.func private @composite_dot_general_with_relu_fn_1(%arg0: tensor<10x1x1024xf32>, %arg1: tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %cst = stablehlo.constant dense<0.000000e+00> : tensor<10x1x3xf32> + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0] x [0], contracting_dims = [2] x [1], precision = [DEFAULT, DEFAULT] {mhlo.frontend_attributes = {grad_x = "false", grad_y = "false"}} : 
(tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + %1 = stablehlo.maximum %0, %cst : tensor<10x1x3xf32> + return %1 : tensor<10x1x3xf32> + } + // CHECK-LABEL: func.func private @composite_dot_general_with_relu_fn_1 + // CHECK-NOT: "tf.CalibrationStatisticsSaver" +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_insert_weight_param.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_insert_weight_param.mlir new file mode 100644 index 000000000000..8812a2963b72 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_insert_weight_param.mlir @@ -0,0 +1,374 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-insert-weight-param | FileCheck %s + +// Test that q/dq pair with per-tensor quantization parameter is inserted +// between constant and XlaCallModule op with empty `weight_only_ptq` method +// and function name containing conv. + +func.func @qdq_for_conv_weight_empty(%arg0: tensor<1x3x2x3xf32>) -> tensor<1x2x2x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<1x2x2x2>], _entry_function = @composite_conv_fn, + _original_entry_function = "composite_conv_fn", + _stablehlo_module_attrs = {}, _quantization_method = "weight_only_ptq { }", + device = "", dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + } : (tensor<1x3x2x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> + return %0 : tensor<1x2x2x2xf32> +} + +// CHECK-LABEL: func.func @qdq_for_conv_weight_empty +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x2x3xf32>) -> tensor<1x2x2x2xf32> +// CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK: %[[Q:.+]] = "quantization.qcast"(%[[CST]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>> +// CHECK: %[[DQ:.+]] = "quantization.dcast"(%[[Q]]) : (tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<2x3x3x2xf32> +// CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG_0]], %[[DQ]]) +// CHECK-SAME: _entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _quantization_method = "weight_only_ptq { }" +// CHECK-SAME: (tensor<1x3x2x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> +// CHECK: return %[[CALL]] : tensor<1x2x2x2xf32> + +// ----- + +// Test that q/dq pair with per-tensor quantization parameter is inserted +// between constant and XlaCallModule op with empty `weight_only_ptq` method and +// function name containing dot_general. 
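Before the test below, a rough sketch of where the scale value repeated in this file's CHECK lines plausibly comes from: symmetric int8 quantization of the splat 3.000000e-01 weight with the quantized range clamped to [-127, 127], computed from the float32 weight. This is an illustration under those assumptions, not the quantizer implementation.

import numpy as np

weight = np.full((2, 3), 0.3, dtype=np.float32)   # splat 3.000000e-01 weight

max_abs = float(np.max(np.abs(weight)))           # float32(0.3) widened to double
scale = max_abs / 127.0                           # symmetric int8, narrow range
print(scale)   # ~0.0023622048182750312, the scale appearing in the quantized types

# The qcast/dcast pair round-trips the weight through int8 with that scale.
q = np.clip(np.round(weight / scale), -127, 127).astype(np.int8)
dq = q.astype(np.float32) * np.float32(scale)
print(np.max(np.abs(dq - weight)))                # small residual quantization error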
+ +func.func @qdq_for_dot_general_weight_empty(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, + _original_entry_function = "composite_dot_general_fn", + _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, + device = "", dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + } : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> +} + +// CHECK-LABEL: func.func @qdq_for_dot_general_weight_empty +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> +// CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<2x3xf32>}> : () -> tensor<2x3xf32> +// CHECK: %[[Q:.+]] = "quantization.qcast"(%[[CST]]) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>> +// CHECK: %[[DQ:.+]] = "quantization.dcast"(%[[Q]]) : (tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<2x3xf32> +// CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG_0]], %[[DQ]]) +// CHECK-SAME: _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "weight_only_ptq { }" +// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> +// CHECK: return %[[CALL]] : tensor<1x3xf32> + +// ----- + +// Test that q/dq pair with per-tensor quantization parameter is inserted +// between constant and XlaCallModule op with `weight_only_ptq` method of +// `per_tensor` and function name containing conv. 
+ +func.func @qdq_for_conv_weight_per_tensor(%arg0: tensor<1x3x2x3xf32>) -> tensor<1x2x2x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<1x2x2x2>], _entry_function = @composite_conv_fn, + _original_entry_function = "composite_conv_fn", + _stablehlo_module_attrs = {}, _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {per_tensor {}}}}", + device = "", dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + } : (tensor<1x3x2x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> + return %0 : tensor<1x2x2x2xf32> +} + +// CHECK-LABEL: func.func @qdq_for_conv_weight_per_tensor +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x2x3xf32>) -> tensor<1x2x2x2xf32> +// CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK: %[[Q:.+]] = "quantization.qcast"(%[[CST]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>> +// CHECK: %[[DQ:.+]] = "quantization.dcast"(%[[Q]]) : (tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<2x3x3x2xf32> +// CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG_0]], %[[DQ]]) +// CHECK-SAME: _entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {per_tensor {}}}}" +// CHECK-SAME: (tensor<1x3x2x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> +// CHECK: return %[[CALL]] : tensor<1x2x2x2xf32> + +// ----- + +// Test that q/dq pair with per-tensor quantization parameter is inserted +// between constant and XlaCallModule op with `weight_only_ptq` method of +// `per_tensor` and function name containing dot_general. 
+ +func.func @qdq_for_dot_general_weight_per_tensor(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, + _original_entry_function = "composite_dot_general_fn", + _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {per_tensor {}}}}", _stablehlo_module_attrs = {}, + device = "", dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + } : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> +} + +// CHECK-LABEL: func.func @qdq_for_dot_general_weight_per_tensor +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> +// CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<2x3xf32>}> : () -> tensor<2x3xf32> +// CHECK: %[[Q:.+]] = "quantization.qcast"(%[[CST]]) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>> +// CHECK: %[[DQ:.+]] = "quantization.dcast"(%[[Q]]) : (tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<2x3xf32> +// CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG_0]], %[[DQ]]) +// CHECK-SAME: _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {per_tensor {}}}}" +// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> +// CHECK: return %[[CALL]] : tensor<1x3xf32> + +// ----- + +// Test that q/dq pair with per-channel quantization parameter is inserted +// between constant and XlaCallModule op with `weight_only_ptq` method of +// `quatized_type` without specified quantization dimension and function name +// containing conv. 
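For the per-channel cases that follow, the quantization dimension is the output-feature axis (dimension 3 of the conv weight here), and each channel gets its own scale. Because the test weights are splat constants, every channel scale equals the per-tensor value. A rough sketch under those assumptions, not the pass implementation:

import numpy as np

weight = np.full((2, 3, 3, 2), 0.3, dtype=np.float32)   # [0, 1, i, o] conv weight
axis = 3                                                 # output-feature dimension

reduce_axes = tuple(d for d in range(weight.ndim) if d != axis)
max_abs = np.max(np.abs(weight), axis=reduce_axes)       # one value per channel
scales = max_abs.astype(np.float64) / 127.0              # symmetric int8 per channel

print(scales)  # two equal values, ~0.0023622048, matching the CHECK'd channel scales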
+ +module attributes {tf_saved_model.semantics} { + func.func private @qdq_for_conv_weight_per_channel_default(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], version = 5 : i64, + _entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", + _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {}}}}", + _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", + device = "" + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %0 : tensor<1x3x4x2xf32> + } + + // CHECK: func.func private @qdq_for_conv_weight_per_channel_default(%[[ARG0:.+]]: tensor<1x3x4x3xf32>) + // CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> + // CHECK: %[[Q:.+]] = "quantization.qcast"(%[[CST]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>> + // CHECK: %[[DQ:.+]] = "quantization.dcast"(%[[Q]]) : (tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<2x3x3x2xf32> + // CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG0]], %[[DQ]]) + // CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + // CHECK: return %[[CALL]] + + func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %0 : tensor<1x3x4x2xf32> + } + // CHECK: func private @composite_conv_fn + // CHECK: %[[CONV:.+]] = stablehlo.convolution + // CHECK: return %[[CONV]] +} + +// ----- + +// Test that q/dq pair with per-channel quantization parameter is inserted +// between constant and XlaCallModule op with `weight_only_ptq` method of +// `quatized_type` without specified quantization dimension and function name +// containing dot_general. 
+ +module attributes {tf_saved_model.semantics} { + func.func private @qdq_for_dot_general_weight_per_channel_default(%arg0: tensor<4x3x6x5xf32>) -> tensor<4x3x6x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<4x3x5x2xf32>} : () -> tensor<4x3x5x2xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<4x3x6x2>], _entry_function = @composite_dot_general_fn, + _original_entry_function = "composite_dot_general_fn", + _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {}}}}", + _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", + device = "", dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + } : (tensor<4x3x6x5xf32>, tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> + return %0 : tensor<4x3x6x2xf32> + } + // CHECK: func.func private @qdq_for_dot_general_weight_per_channel_default(%[[ARG0:.+]]: tensor<4x3x6x5xf32>) + // CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<4x3x5x2xf32>}> : () -> tensor<4x3x5x2xf32> + // CHECK: %[[Q:.+]] = "quantization.qcast"(%[[CST]]) : (tensor<4x3x5x2xf32>) -> tensor<4x3x5x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>> + // CHECK: %[[DQ:.+]] = "quantization.dcast"(%[[Q]]) : (tensor<4x3x5x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<4x3x5x2xf32> + // CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG0]], %[[DQ]]) + // CHECK-SAME: (tensor<4x3x6x5xf32>, tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> + // CHECK: return %[[CALL]] + + func.func private @composite_dot_general_fn(%arg0: tensor<4x3x6x5xf32>, %arg1: tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0, 1] x [0, 1], contracting_dims = [3] x [2] : (tensor<4x3x6x5xf32>, tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> + return %0 : tensor<4x3x6x2xf32> + } + // CHECK: func private @composite_dot_general_fn + // CHECK: %[[DOT:.+]] = stablehlo.dot_general + // CHECK: return %[[DOT]] +} + +// ----- + +// Test that q/dq pair with per-channel quantization parameter is inserted +// between constant and XlaCallModule op with `weight_only_ptq` method of +// `quatized_type` with specified quantization dimension and function name +// containing conv. 
+
+module attributes {tf_saved_model.semantics} {
+ func.func private @qdq_for_conv_weight_per_channel(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} {
+ %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
+ %0 = "tf.XlaCallModule"(%arg0, %cst) {
+ Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [],
+ has_token_input_output = false, module = "", platforms = [], version = 5 : i64,
+ _entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn",
+ _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}",
+ _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable",
+ device = ""
+ } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
+ return %0 : tensor<1x3x4x2xf32>
+ }
+
+ // CHECK: func.func private @qdq_for_conv_weight_per_channel(%[[ARG0:.+]]: tensor<1x3x4x3xf32>)
+ // CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32>
+ // CHECK: %[[Q:.+]] = "quantization.qcast"(%[[CST]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>
+ // CHECK: %[[DQ:.+]] = "quantization.dcast"(%[[Q]]) : (tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<2x3x3x2xf32>
+ // CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG0]], %[[DQ]])
+ // CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
+ // CHECK: return %[[CALL]]
+
+ func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} {
+ %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
+ return %0 : tensor<1x3x4x2xf32>
+ }
+ // CHECK: func private @composite_conv_fn
+ // CHECK: %[[CONV:.+]] = stablehlo.convolution
+ // CHECK: return %[[CONV]]
+}
+
+// -----
+
+// Test that q/dq pair with per-channel quantization parameter is inserted
+// between constant and XlaCallModule op with `weight_only_ptq` method of
+// `quantized_type` with specified quantization dimension and function name
+// containing dot_general.
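+// In these tests `dimension: 3` in `dimension_specs` selects axis 3 of the weight as the
+// quantization axis, which matches the `f32:3` in the expected quantized element type.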
+ +module attributes {tf_saved_model.semantics} { + func.func private @qdq_for_dot_general_weight_per_channel(%arg0: tensor<4x3x6x5xf32>) -> tensor<4x3x6x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<4x3x5x2xf32>} : () -> tensor<4x3x5x2xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<4x3x6x2>], _entry_function = @composite_dot_general_fn, + _original_entry_function = "composite_dot_general_fn", + _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}", + _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", + device = "", dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + } : (tensor<4x3x6x5xf32>, tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> + return %0 : tensor<4x3x6x2xf32> + } + // CHECK: func.func private @qdq_for_dot_general_weight_per_channel(%[[ARG0:.+]]: tensor<4x3x6x5xf32>) + // CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<4x3x5x2xf32>}> : () -> tensor<4x3x5x2xf32> + // CHECK: %[[Q:.+]] = "quantization.qcast"(%[[CST]]) : (tensor<4x3x5x2xf32>) -> tensor<4x3x5x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>> + // CHECK: %[[DQ:.+]] = "quantization.dcast"(%[[Q]]) : (tensor<4x3x5x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<4x3x5x2xf32> + // CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG0]], %[[DQ]]) + // CHECK-SAME: (tensor<4x3x6x5xf32>, tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> + // CHECK: return %[[CALL]] + + func.func private @composite_dot_general_fn(%arg0: tensor<4x3x6x5xf32>, %arg1: tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0, 1] x [0, 1], contracting_dims = [3] x [2] : (tensor<4x3x6x5xf32>, tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> + return %0 : tensor<4x3x6x2xf32> + } + // CHECK: func private @composite_dot_general_fn + // CHECK: %[[DOT:.+]] = stablehlo.dot_general + // CHECK: return %[[DOT]] +} + +// ----- + +// Test that q/dq pair is not inserted between constant and XlaCallModule op +// whose entry function name does not include conv nor dot_general. + +func.func @no_qdq_except_conv_and_dot_general(%arg0: tensor<2x3x2xi64>) -> tensor<2x3x2x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<3x4x2xf32>} : () -> tensor<3x4x2xf32> + %0 = "tf.XlaCallModule"(%cst, %arg0) { + Sout = [#tf_type.shape<1x3>], _entry_function = @composite_gather_fn, + _original_entry_function = "composite_gather_fn", _quantization_method = "weight_only_ptq { }", + _stablehlo_module_attrs = {}, device = "", dim_args_spec = [], + disabled_checks = [], has_token_input_output = false, module = "", + platforms = [], version = 5 : i64 + } : (tensor<3x4x2xf32>, tensor<2x3x2xi64>) -> tensor<2x3x2x2xf32> + return %0 : tensor<2x3x2x2xf32> +} + +// CHECK-LABEL: func.func @no_qdq_except_conv_and_dot_general +// CHECK-NOT: quantization.qcast +// CHECK-NOT: quantization.dcast + +// ----- + +// Test that q/dq pair is not inserted for constant whose operand number is +// not 1. 
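+// The bias constant below is passed as operand 2 of the XlaCallModule rather than as the
+// weight operand at index 1, so no q/dq pair is expected around it.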
+ +func.func @no_qdq_for_non_weight_constant(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<4.000000e-02> : tensor<3xf32>} : () -> tensor<3xf32> + %0 = "tf.XlaCallModule"(%arg0, %arg1, %cst) { + Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_bias_fn, + _original_entry_function = "composite_dot_general_with_bias_fn", + _stablehlo_module_attrs = {}, _quantization_method = "weight_only_ptq { }", + device = "", dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + } : (tensor<1x2xf32>, tensor<2x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> +} + +// CHECK-LABEL: func.func @no_qdq_for_non_weight_constant +// CHECK-NOT: quantization.qcast +// CHECK-NOT: quantization.dcast + +// ----- + +// Test that q/dq pair is not inserted between constant and XlaCallModule op +// without `weight_only_ptq` method. + +func.func @no_qdq_for_not_quantizable_call(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, + _original_entry_function = "composite_dot_general_fn", + _stablehlo_module_attrs = {}, device = "", dim_args_spec = [], + disabled_checks = [], has_token_input_output = false, module = "", + platforms = [], version = 5 : i64 + } : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> +} + +// CHECK-LABEL: func.func @no_qdq_for_not_quantizable_call +// CHECK-NOT: quantization.qcast +// CHECK-NOT: quantization.dcast + +// ----- + +// Test that q/dq pair is not inserted between constant and XlaCallModule op +// with different method. + +func.func @no_qdq_for_not_quantizable_call(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, + _original_entry_function = "composite_dot_general_fn", + _stablehlo_module_attrs = {}, device = "", dim_args_spec = [], + disabled_checks = [], has_token_input_output = false, module = "", + platforms = [], _quantization_method = "static_range_ptq { }", version = 5 : i64 + } : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> +} + +// CHECK-LABEL: func.func @no_qdq_for_not_quantizable_call +// CHECK-NOT: quantization.qcast +// CHECK-NOT: quantization.dcast + +// ----- + +// Test that q/dq pair is not inserted when constant has multiple users. 
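+// The constant below is also consumed by a stablehlo.add, so it is not used exclusively as
+// the weight of the XlaCallModule and is expected to stay unquantized.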
+ +func.func @no_qdq_for_multiple_users(%arg0: tensor<2x2xf32>) -> tensor<2x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, + _original_entry_function = "composite_dot_general_fn", + _stablehlo_module_attrs = {}, _quantization_method = "weight_only_ptq { }", + device = "", dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + } : (tensor<2x2xf32>, tensor<2x3xf32>) -> tensor<2x3xf32> + %2 = stablehlo.add %cst, %0 : tensor<2x3xf32> + return %2 : tensor<2x3xf32> +} + +// CHECK-LABEL: func.func @no_qdq_for_multiple_users +// CHECK-NOT: quantization.qcast +// CHECK-NOT: quantization.dcast diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_lift_quantizable_spots_as_functions.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_lift_quantizable_spots_as_functions.mlir new file mode 100644 index 000000000000..e0c0406bb892 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_lift_quantizable_spots_as_functions.mlir @@ -0,0 +1,861 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-lift-quantizable-spots-as-functions | FileCheck %s + +// CHECK-LABEL: @conv_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> +func.func @conv_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + func.return %1: tensor<1x3x3x4xf32> +} +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3x3x4xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_conv_fn_1 +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK: return %[[CONV]] : tensor<1x3x3x4xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @dot_general_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> +func.func @dot_general_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + return %1 : tensor<1x1x64xf32> +} +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_fn_1 +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: return %[[DOT_GENERAL:.*]] : tensor<1x1x64xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @dot_general_with_bias_same_shape_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x2xf32> +func.func @dot_general_with_bias_same_shape_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<2x3xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x3xf32> + %2 = 
stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %3 = stablehlo.add %2, %1 : tensor<1x3xf32> + func.return %3: tensor<1x3xf32> +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_same_shape_fn_1 +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %arg2 +// CHECK: return %[[ADD]] : tensor<1x3xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @conv_with_bias_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> +func.func @conv_with_bias_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<4xf32> + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %3 = stablehlo.broadcast_in_dim %1, dims = [3] : (tensor<4xf32>) -> tensor<1x3x3x4xf32> + %4 = stablehlo.add %2, %3 : tensor<1x3x3x4xf32> + func.return %4: tensor<1x3x3x4xf32> +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3x3x4xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_conv_with_bias_fn_1 +// CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[BROADCAST_IN_DIM]] +// CHECK: return %[[ADD]] : tensor<1x3x3x4xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @dot_general_with_bias_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> +func.func @dot_general_with_bias_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<64xf32> + %2 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %3 = stablehlo.broadcast_in_dim %1, dims = [2] : (tensor<64xf32>) -> tensor<1x1x64xf32> + %4 = stablehlo.add %2, %3 : tensor<1x1x64xf32> + func.return %4: tensor<1x1x64xf32> +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_fn_1 +// CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[BROADCAST_IN_DIM]] +// CHECK: return %[[ADD]] : tensor<1x1x64xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: 
@conv_with_bias_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @conv_with_bias_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x1x16xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<16xf32> + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %3 = shape.shape_of %2 : tensor -> tensor<4xindex> + %4 = stablehlo.dynamic_broadcast_in_dim %1, %3, dims = [3] : (tensor<16xf32>, tensor<4xindex>) -> tensor + %5 = stablehlo.add %2, %4 : tensor + func.return %5: tensor +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_conv_with_bias_dynamic_fn_1 +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK: %[[SHAPE_OF:.*]] = shape.shape_of %[[CONV]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF]] +// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[DYNAMIC_BROADCAST_IN_DIM]] +// CHECK: return %[[ADD]] : tensor +// CHECK: } + +// ----- + +// Because the operand of shape_of is other than the target conv, +// should not match conv bias pattern. + +// CHECK-LABEL: @conv_with_bias_dynamic_shape_not_same_op_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @conv_with_bias_dynamic_shape_not_same_op_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x1x16xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<16xf32> + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %3 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %4 = shape.shape_of %3 : tensor -> tensor<4xindex> + %5 = stablehlo.dynamic_broadcast_in_dim %1, %4, dims = [3] : (tensor<16xf32>, tensor<4xindex>) -> tensor + %6 = stablehlo.add %2, %5 : tensor + func.return %6: tensor +} +// CHECK-NOT: @composite_conv_with_bias_dynamic_fn_1 + +// ----- + +// CHECK-LABEL: @dot_general_with_bias_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @dot_general_with_bias_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<12544x10xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<10xf32> + %2 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor, tensor<12544x10xf32>) -> tensor + %3 = shape.shape_of %2 : tensor -> tensor<2xindex> + %4 = stablehlo.dynamic_broadcast_in_dim %1, %3, dims = [1] : (tensor<10xf32>, tensor<2xindex>) -> tensor + %5 = stablehlo.add %2, %4 : tensor + func.return %5: tensor +} +// CHECK: 
%[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_dynamic_fn_1 +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: %[[SHAPE_OF_0:.*]] = shape.shape_of %[[DOT_GENERAL]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF_0]] +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] +// CHECK: return %[[ADD]] : tensor +// CHECK: } + +// ----- + +// CHECK-LABEL: @conv_with_relu_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> +func.func @conv_with_relu_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %3 = stablehlo.maximum %2, %1 : tensor<1x3x3x4xf32> + func.return %3: tensor<1x3x3x4xf32> +} +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3x3x4xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_conv_with_relu_fn_1 +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[CONV]], %[[CONST]] +// CHECK: return %[[MAX]] : tensor<1x3x3x4xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @dot_general_with_relu_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32>, +func.func @dot_general_with_relu_fn(%arg0: tensor<1x1x167xf32>, %arg1: tensor<167x64xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> + %2 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %3 = stablehlo.maximum %2, %1 : tensor<1x1x64xf32> + return %3 : tensor<1x1x64xf32> +} +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_relu_fn_1 +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[DOT_GENERAL]], %[[CONST]] +// CHECK: return %[[MAX:.*]] : tensor<1x1x64xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @conv_with_relu_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @conv_with_relu_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x1x16xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} 
{batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %3 = shape.shape_of %2 : tensor -> tensor<4xindex> + %4 = stablehlo.dynamic_broadcast_in_dim %1, %3, dims = [] : (tensor, tensor<4xindex>) -> tensor + %5 = stablehlo.maximum %2, %4 : tensor + func.return %5: tensor +} +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_conv_with_relu_dynamic_fn_1 +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK: %[[SHAPE_OF:.*]] = shape.shape_of %[[CONV]] +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM:.*]] = stablehlo.dynamic_broadcast_in_dim %[[CONST]], %[[SHAPE_OF]] +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[CONV]], %[[DYNAMIC_BROADCAST_IN_DIM]] +// CHECK: return %[[MAX]] : tensor +// CHECK: } + +// ----- + +// Because the operand of shape_of is other than the target conv, +// should not match conv relu dynamic pattern. + +// CHECK-LABEL: @conv_with_relu_dynamic_shape_not_same_op_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @conv_with_relu_dynamic_shape_not_same_op_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x1x16xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %3 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %4 = shape.shape_of %3 : tensor -> tensor<4xindex> + %5 = stablehlo.dynamic_broadcast_in_dim %1, %4, dims = [] : (tensor, tensor<4xindex>) -> tensor + %6 = stablehlo.maximum %2, %5 : tensor + func.return %6: tensor +} +// CHECK-NOT: private @composite_conv_with_relu_dynamic_fn_1 + +// ----- + +// CHECK-LABEL: @dot_general_with_relu_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @dot_general_with_relu_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<12544x10xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor + %2 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor, tensor<12544x10xf32>) -> tensor + %3 = shape.shape_of %2 : tensor -> tensor<2xindex> + %4 = stablehlo.dynamic_broadcast_in_dim %1, %3, dims = [] : (tensor, tensor<2xindex>) -> tensor + %5 = stablehlo.maximum %2, %4 : tensor + func.return %5: tensor +} +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_relu_dynamic_fn_1 +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: %[[SHAPE_OF:.*]] = shape.shape_of %[[DOT_GENERAL]] +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant 
dense<0.000000e+00> +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM:.*]] = stablehlo.dynamic_broadcast_in_dim %[[CONST]], %[[SHAPE_OF]] +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[DOT_GENERAL]], %[[DYNAMIC_BROADCAST_IN_DIM]] +// CHECK: return %[[MAX]] : tensor +// CHECK: } + +// ----- + +// The pattern should not match when the const value for relu is not 0. + +// CHECK-LABEL: @conv_with_relu_wrong_const_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> +func.func @conv_with_relu_wrong_const_fn(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %3 = stablehlo.maximum %2, %1 : tensor<1x3x3x4xf32> + func.return %3: tensor<1x3x3x4xf32> +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]]) +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[XLA_CALL_MODULE]], %[[CONST_1]] +// CHECK: return %[[MAX]] : tensor<1x3x3x4xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_conv_fn_1 +// CHECK-NOT: private @composite_conv_with_relu_fn_1 + +// ----- + +// CHECK-LABEL: @conv_with_relu6_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> +func.func @conv_with_relu6_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> + %2 = stablehlo.constant dense<6.000000e+00> : tensor<1x3x3x4xf32> + %3 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %4 = stablehlo.clamp %1, %3, %2 : tensor<1x3x3x4xf32> + func.return %4: tensor<1x3x3x4xf32> +} +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3x3x4xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_conv_with_relu6_fn_1 +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[CONV]], %[[CONST_1]] +// CHECK: return %[[CLAMP]] : tensor<1x3x3x4xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @dot_general_with_relu6_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> +func.func @dot_general_with_relu6_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> + %2 = stablehlo.constant dense<6.000000e+00> : tensor<1x1x64xf32> + %3 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %4 = stablehlo.clamp %1, %3, %2 : tensor<1x1x64xf32> + return %4 : tensor<1x1x64xf32> 
+} +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_relu6_fn_1 +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[DOT_GENERAL]], %[[CONST_1]] +// CHECK: return %[[CLAMP]] : tensor<1x1x64xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @conv_with_relu6_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @conv_with_relu6_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x1x16xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor + %2 = stablehlo.constant dense<6.000000e+00> : tensor + %3 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %4 = stablehlo.clamp %1, %3, %2 : (tensor, tensor, tensor) -> tensor + func.return %4: tensor +} +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_conv_with_relu6_fn_1 +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[CONV]], %[[CONST_1]] +// CHECK: return %[[CLAMP]] : tensor +// CHECK: } + +// ----- + +// CHECK-LABEL: @dot_general_with_relu6_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @dot_general_with_relu6_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<12544x10xf32> + %1 = stablehlo.constant dense<0.000000e+00> : tensor + %2 = stablehlo.constant dense<6.000000e+00> : tensor + %3 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor, tensor<12544x10xf32>) -> tensor + %4 = stablehlo.clamp %1, %3, %2 : (tensor, tensor, tensor) -> tensor + func.return %4: tensor +} +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_relu6_fn_1 +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[DOT_GENERAL]], %[[CONST_1]] +// CHECK: return %[[CLAMP]] : tensor +// CHECK: } + +// ----- + +// CHECK-LABEL: @dot_general_with_bias_same_shape_and_relu_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> +func.func @dot_general_with_bias_same_shape_and_relu_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x64xf32> + %2 = stablehlo.constant 
dense<0.000000e+00> : tensor<1x1x64xf32> + %3 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %4 = stablehlo.add %3, %1 : tensor<1x1x64xf32> + %5 = stablehlo.maximum %4, %2 : tensor<1x1x64xf32> + func.return %5: tensor<1x1x64xf32> +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_same_shape_and_relu_fn_1 +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %arg2 +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[CONST]] +// CHECK: return %[[MAX]] : tensor<1x1x64xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @conv_with_bias_and_relu_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> +func.func @conv_with_bias_and_relu_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<4xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> + %3 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %4 = stablehlo.broadcast_in_dim %1, dims = [3] : (tensor<4xf32>) -> tensor<1x3x3x4xf32> + %5 = stablehlo.add %3, %4 : tensor<1x3x3x4xf32> + %6 = stablehlo.maximum %5, %2 : tensor<1x3x3x4xf32> + func.return %6: tensor<1x3x3x4xf32> +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3x3x4xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_conv_with_bias_and_relu_fn_1 +// CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[BROADCAST_IN_DIM]] +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[CONST]] +// CHECK: return %[[MAX]] : tensor<1x3x3x4xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @dot_general_with_bias_and_relu_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> +func.func @dot_general_with_bias_and_relu_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<64xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> + %3 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %4 = stablehlo.broadcast_in_dim %1, dims = [2] : (tensor<64xf32>) -> tensor<1x1x64xf32> + %5 = stablehlo.add %3, %4 : tensor<1x1x64xf32> + %6 = stablehlo.maximum %5, %2 : tensor<1x1x64xf32> + func.return %6: tensor<1x1x64xf32> +} +// CHECK: 
%[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_and_relu_fn_1 +// CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[BROADCAST_IN_DIM]] +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[CONST]] +// CHECK: return %[[MAX]] : tensor<1x1x64xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @conv_with_bias_and_relu_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @conv_with_bias_and_relu_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x1x16xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<16xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor + %3 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %4 = shape.shape_of %3 : tensor -> tensor<4xindex> + %5 = stablehlo.dynamic_broadcast_in_dim %1, %4, dims = [3] : (tensor<16xf32>, tensor<4xindex>) -> tensor + %6 = stablehlo.add %3, %5 : tensor + %7 = shape.shape_of %6 : tensor -> tensor<4xindex> + %8 = stablehlo.dynamic_broadcast_in_dim %2, %7, dims = [] : (tensor, tensor<4xindex>) -> tensor + %9 = stablehlo.maximum %6, %8 : tensor + func.return %9: tensor +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_conv_with_bias_and_relu_dynamic_fn_1 +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK: %[[SHAPE_OF_0:.*]] = shape.shape_of %[[CONV]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF_0]] +// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] +// CHECK: %[[SHAPE_OF_1:.*]] = shape.shape_of %[[ADD]] +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_1:.*]] = stablehlo.dynamic_broadcast_in_dim %[[CONST]], %[[SHAPE_OF_1]] +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[DYNAMIC_BROADCAST_IN_DIM_1]] +// CHECK: return %[[MAX]] : tensor +// CHECK: } + +// ----- + +// Because the operand of shape_of is other than the target conv, +// should not match conv bias relu dynamic pattern. 
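+// In the function below the bias broadcast takes its shape from a second convolution rather
+// than from the convolution feeding the add, so no composite function is expected to be lifted.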
+ +// CHECK-LABEL: @conv_with_bias_and_relu_dynamic_shape_not_same_op_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @conv_with_bias_and_relu_dynamic_shape_not_same_op_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x1x16xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<16xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor + %3 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %4 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %5 = shape.shape_of %4 : tensor -> tensor<4xindex> + %6 = stablehlo.dynamic_broadcast_in_dim %1, %5, dims = [3] : (tensor<16xf32>, tensor<4xindex>) -> tensor + %7 = stablehlo.add %3, %6 : tensor + %8 = shape.shape_of %7 : tensor -> tensor<4xindex> + %9 = stablehlo.dynamic_broadcast_in_dim %2, %8, dims = [] : (tensor, tensor<4xindex>) -> tensor + %10 = stablehlo.maximum %7, %9 : tensor + func.return %10: tensor +} +// CHECK-NOT: private @composite_conv_with_bias_and_relu_dynamic_fn_1 + +// ----- + +// CHECK-LABEL: @dot_general_with_bias_and_relu_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @dot_general_with_bias_and_relu_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<12544x10xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<10xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor + %3 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor, tensor<12544x10xf32>) -> tensor + %4 = shape.shape_of %3 : tensor -> tensor<2xindex> + %5 = stablehlo.dynamic_broadcast_in_dim %1, %4, dims = [1] : (tensor<10xf32>, tensor<2xindex>) -> tensor + %6 = stablehlo.add %3, %5 : tensor + %7 = shape.shape_of %6 : tensor -> tensor<2xindex> + %8 = stablehlo.dynamic_broadcast_in_dim %2, %7, dims = [] : (tensor, tensor<2xindex>) -> tensor + %9 = stablehlo.maximum %6, %8 : tensor + func.return %9: tensor +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_and_relu_dynamic_fn_1 +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: %[[SHAPE_OF_0:.*]] = shape.shape_of %[[DOT_GENERAL]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF_0]] +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] +// CHECK: %[[SHAPE_OF_1:.*]] = shape.shape_of %[[ADD]] +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_1:.*]] = stablehlo.dynamic_broadcast_in_dim %[[CONST]], %[[SHAPE_OF_1]] +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[DYNAMIC_BROADCAST_IN_DIM_1]] +// CHECK: return %[[MAX]] : tensor +// CHECK: } + +// ----- + +// 
CHECK-LABEL: @dot_general_with_bias_same_shape_and_relu6_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> +func.func @dot_general_with_bias_same_shape_and_relu6_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x64xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> + %3 = stablehlo.constant dense<6.000000e+00> : tensor<1x1x64xf32> + %4 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %5 = stablehlo.add %4, %1 : tensor<1x1x64xf32> + %6 = stablehlo.clamp %2, %5, %3 : tensor<1x1x64xf32> + func.return %6: tensor<1x1x64xf32> +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_same_shape_and_relu6_fn_1 +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %arg2 +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[ADD]], %[[CONST_1]] +// CHECK: return %[[CLAMP]] : tensor<1x1x64xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @conv_with_bias_and_relu6_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> +func.func @conv_with_bias_and_relu6_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<4xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> + %3 = stablehlo.constant dense<6.000000e+00> : tensor<1x3x3x4xf32> + %4 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %5 = stablehlo.broadcast_in_dim %1, dims = [3] : (tensor<4xf32>) -> tensor<1x3x3x4xf32> + %6 = stablehlo.add %4, %5 : tensor<1x3x3x4xf32> + %7 = stablehlo.clamp %2, %6, %3 : tensor<1x3x3x4xf32> + func.return %7: tensor<1x3x3x4xf32> +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3x3x4xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_conv_with_bias_and_relu6_fn_1 +// CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[BROADCAST_IN_DIM]] +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[ADD]], %[[CONST_1]] +// CHECK: return %[[CLAMP]] : tensor<1x3x3x4xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @dot_general_with_bias_and_relu6_fn( +// CHECK-SAME: %[[ARG_0:.*]]: 
tensor<1x1x167xf32> +func.func @dot_general_with_bias_and_relu6_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<64xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> + %3 = stablehlo.constant dense<6.000000e+00> : tensor<1x1x64xf32> + %4 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %5 = stablehlo.broadcast_in_dim %1, dims = [2] : (tensor<64xf32>) -> tensor<1x1x64xf32> + %6 = stablehlo.add %4, %5 : tensor<1x1x64xf32> + %7 = stablehlo.clamp %2, %6, %3 : tensor<1x1x64xf32> + func.return %7: tensor<1x1x64xf32> +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_and_relu6_fn_1 +// CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[BROADCAST_IN_DIM]] +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[ADD]], %[[CONST_1]] +// CHECK: return %[[CLAMP]] : tensor<1x1x64xf32> +// CHECK: } + +// ----- + +// CHECK-LABEL: @conv_with_bias_and_relu6_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @conv_with_bias_and_relu6_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x1x16xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<16xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor + %3 = stablehlo.constant dense<6.000000e+00> : tensor + %4 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %5 = shape.shape_of %4 : tensor -> tensor<4xindex> + %6 = stablehlo.dynamic_broadcast_in_dim %1, %5, dims = [3] : (tensor<16xf32>, tensor<4xindex>) -> tensor + %7 = stablehlo.add %4, %6 : tensor + %8 = stablehlo.clamp %2, %7, %3 : (tensor, tensor, tensor) -> tensor + func.return %8: tensor +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_conv_with_bias_and_relu6_dynamic_fn_1 +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK: %[[SHAPE_OF_0:.*]] = shape.shape_of %[[CONV]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF_0]] +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[ADD]], %[[CONST_1]] +// CHECK: return %[[CLAMP]] : 
tensor +// CHECK: } + +// ----- + +// Because the operand of shape_of is other than the target conv, +// should not match conv bias relu6 dynamic pattern. + +// CHECK-LABEL: @conv_with_bias_and_relu6_dynamic_shape_not_same_op_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @conv_with_bias_and_relu6_dynamic_shape_not_same_op_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x1x16xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<16xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor + %3 = stablehlo.constant dense<6.000000e+00> : tensor + %4 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %5 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor, tensor<3x3x1x16xf32>) -> tensor + %6 = shape.shape_of %5 : tensor -> tensor<4xindex> + %7 = stablehlo.dynamic_broadcast_in_dim %1, %6, dims = [3] : (tensor<16xf32>, tensor<4xindex>) -> tensor + %8 = stablehlo.add %4, %7 : tensor + %9 = stablehlo.clamp %2, %8, %3 : (tensor, tensor, tensor) -> tensor + func.return %9: tensor +} +// CHECK-NOT: private @composite_conv_with_bias_and_relu6_dynamic_fn_1 + +// ----- + +// CHECK-LABEL: @dot_general_with_bias_and_relu6_dynamic_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor +func.func @dot_general_with_bias_and_relu6_dynamic_fn(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<12544x10xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<10xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor + %3 = stablehlo.constant dense<6.000000e+00> : tensor + %4 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor, tensor<12544x10xf32>) -> tensor + %5 = shape.shape_of %4 : tensor -> tensor<2xindex> + %6 = stablehlo.dynamic_broadcast_in_dim %1, %5, dims = [1] : (tensor<10xf32>, tensor<2xindex>) -> tensor + %7 = stablehlo.add %4, %6 : tensor + %8 = stablehlo.clamp %2, %7, %3 : (tensor, tensor, tensor) -> tensor + func.return %8: tensor +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_and_relu6_dynamic_fn_1 +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: %[[SHAPE_OF_0:.*]] = shape.shape_of %[[DOT_GENERAL]] +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF_0]] +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[ADD]], %[[CONST_1]] +// CHECK: return %[[CLAMP]] : tensor +// CHECK: } + +// ----- + +// CHECK-LABEL: @gather_fn( +func.func @gather_fn() -> tensor<2x3x2x2xi32> { + %0 = stablehlo.constant dense<1> : tensor<3x4x2xi32> + %1 = stablehlo.constant dense<1> : tensor<2x3x2xi64> + %2 = "stablehlo.gather"(%0, %1) { + dimension_numbers = 
#stablehlo.gather< + offset_dims = [2, 3], + collapsed_slice_dims = [0], + start_index_map = [1, 0], + index_vector_dim = 2>, + slice_sizes = array, + indices_are_sorted = false +} : (tensor<3x4x2xi32>, tensor<2x3x2xi64>) -> tensor<2x3x2x2xi32> + func.return %2: tensor<2x3x2x2xi32> +} +// CHECK: %[[OPERAND:.*]] = stablehlo.constant +// CHECK: %[[INDICES:.*]] = stablehlo.constant +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[OPERAND]], %[[INDICES]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<2x3x2x2xi32> +// CHECK: } + +// CHECK-LABEL: private @composite_gather_fn_1 +// CHECK: %[[GATHER:.*]] = "stablehlo.gather"(%arg0, %arg1) +// CHECK: return %[[GATHER]] : tensor<2x3x2x2xi32> +// CHECK: } + +// ----- + +// Test that the name of composite functions are deterministic. There are 3 +// unsorted functions in this module and each function has 2 quantizable ops. +module { + func.func @conv_3_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%1, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + func.return %2: tensor<1x3x3x4xf32> + } + + func.func @conv_1_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%1, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + func.return %2: tensor<1x3x3x4xf32> + } + + func.func @conv_2_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%1, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + func.return %2: tensor<1x3x3x4xf32> + } +} + +// CHECK-LABEL: @conv_3_fn +// CHECK: tf.XlaCallModule +// CHECK-SAME: _entry_function = @composite_conv_fn_6, _original_entry_function = "composite_conv_fn_6" +// CHECK-SAME: _stablehlo_version = "{{.*}}" +// CHECK: tf.XlaCallModule +// CHECK-SAME: _entry_function = @composite_conv_fn_5, _original_entry_function = "composite_conv_fn_5" +// CHECK-SAME: _stablehlo_version = "{{.*}}" + +// CHECK-LABEL: @conv_1_fn +// CHECK: tf.XlaCallModule +// CHECK-SAME: _entry_function = @composite_conv_fn_2, _original_entry_function = "composite_conv_fn_2" 
+// CHECK-SAME: _stablehlo_version = "{{.*}}" +// CHECK: tf.XlaCallModule +// CHECK-SAME: _entry_function = @composite_conv_fn_1, _original_entry_function = "composite_conv_fn_1" +// CHECK-SAME: _stablehlo_version = "{{.*}}" + +// CHECK-LABEL: @conv_2_fn +// CHECK: tf.XlaCallModule +// CHECK-SAME: _entry_function = @composite_conv_fn_4, _original_entry_function = "composite_conv_fn_4" +// CHECK-SAME: _stablehlo_version = "{{.*}}" +// CHECK: tf.XlaCallModule +// CHECK-SAME: _entry_function = @composite_conv_fn_3, _original_entry_function = "composite_conv_fn_3" +// CHECK-SAME: _stablehlo_version = "{{.*}}" \ No newline at end of file diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_merge-fusion-with-dequantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_merge-fusion-with-dequantize.mlir new file mode 100644 index 000000000000..65154cb890cf --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_merge-fusion-with-dequantize.mlir @@ -0,0 +1,198 @@ +// RUN: stablehlo-quant-opt %s -tf-stablehlo-merge-fusion-with-dequantize -split-input-file -verify-diagnostics | FileCheck %s + +// Merge fusion with dequantize for relu case. + +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @merge_relu_fusion + func.func private @merge_relu_fusion(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @quantized_dot_general_relu_fn + // CHECK-SAME: -> tensor<1x3xf32> + %2 = call @quantized_dot_general_relu_fn(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @quantized_dot_general_relu_fn + func.func private @quantized_dot_general_relu_fn( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} { + // CHECK: %[[MIN:.*]] = stablehlo.constant dense<0.000000e+00> : tensor + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %arg0, %arg1 + // CHECK: %[[DQ:.*]] = stablehlo.uniform_dequantize %[[DOT]] + // CHECK: %[[MAX:.*]] = chlo.broadcast_maximum %[[DQ]], %[[MIN]] + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } +} + +// ----- + +// Merge fusion with dequantize for relu6 case. 
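In both the relu and relu6 cases the merge moves the final dequantize into the fused function, so the dot_general result is dequantized first and the activation then runs in float (chlo.broadcast_maximum for relu, stablehlo.clamp for relu6). A rough NumPy sketch of the arithmetic, with made-up scales and zero points rather than anything produced by the pass:

import numpy as np

# Assumed quantization parameters (illustrative only).
IN_SCALE, IN_ZP = 0.004, -128       # activation: per-tensor, asymmetric i8
W_SCALE = 0.005                     # weight: symmetric i8, zero point 0

def quantize(x, scale, zp):
    return np.clip(np.round(x / scale) + zp, -128, 127).astype(np.int8)

x = np.array([[0.1, -0.2, 0.3, 0.4]], dtype=np.float32)
w = np.random.uniform(-0.6, 0.6, (4, 3)).astype(np.float32)

# Quantized dot_general: i8 operands, i32 accumulation.
xq, wq = quantize(x, IN_SCALE, IN_ZP), quantize(w, W_SCALE, 0)
acc = (xq.astype(np.int32) - IN_ZP) @ wq.astype(np.int32)

# After the merge: dequantize the accumulator, then apply the activation in float.
y = acc.astype(np.float32) * (IN_SCALE * W_SCALE)
relu_out = np.maximum(y, 0.0)        # relu  -> chlo.broadcast_maximum with 0
relu6_out = np.clip(y, 0.0, 6.0)     # relu6 -> stablehlo.clamp(0, ..., 6)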
+ +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @merge_relu6_fusion + func.func private @merge_relu6_fusion(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @quantized_dot_general_relu6_fn + // CHECK-SAME: -> tensor<1x3xf32> + %2 = call @quantized_dot_general_relu6_fn(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @quantized_dot_general_relu6_fn + func.func private @quantized_dot_general_relu6_fn( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} { + // CHECK-DAG: %[[MIN:.*]] = stablehlo.constant dense<0.000000e+00> : tensor + // CHECK-DAG: %[[MAX:.*]] = stablehlo.constant dense<6.000000e+00> : tensor + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %arg0, %arg1 + // CHECK: %[[DQ:.*]] = stablehlo.uniform_dequantize %[[DOT]] + // CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[MIN]], %[[DQ]], %[[MAX]] + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } +} + +// ----- + +// Merge fusion with dequantize for no activation case. 
+ +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @merge_no_act_fusion + func.func private @merge_no_act_fusion(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @quantized_dot_general_fn + // CHECK-SAME: -> tensor<1x3xf32> + %2 = call @quantized_dot_general_fn(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @quantized_dot_general_fn + func.func private @quantized_dot_general_fn( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} { + // CHECK: %[[DOT:.*]] = stablehlo.dot_general %arg0, %arg1 + // CHECK: %[[DQ:.*]] = stablehlo.uniform_dequantize %[[DOT]] + // CHECK: return %[[DQ]] : tensor<1x3xf32> + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } +} + +// ----- + +// Do not merge when quant.uniform result is used directly. + +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @no_merge_fusion_direct_usage + func.func private @no_merge_fusion_direct_usage(%arg0: tensor<1x4xf32>) -> (tensor<1x3xf32>, tensor<1x3x!quant.uniform>) { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @quantized_dot_general_relu_fn + // CHECK-SAME: -> tensor<1x3x!quant.uniform> + %2 = call @quantized_dot_general_relu_fn(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3, %2 : tensor<1x3xf32>, tensor<1x3x!quant.uniform> + } + + // CHECK-LABEL: func.func private @quantized_dot_general_relu_fn + func.func private @quantized_dot_general_relu_fn( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } +} + +// ----- + +// Do not merge when fusion and dequantize is already merged. 
+ +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @no_merge_fusion_already_merged + func.func private @no_merge_fusion_already_merged(%arg0: tensor<1x4xf32>) -> (tensor<1x3xf32>) { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @quantized_dot_general_fn + // CHECK-SAME: -> tensor<1x3xf32> + %2 = call @quantized_dot_general_fn(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @quantized_dot_general_fn + func.func private @quantized_dot_general_fn( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_dequantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} + +// ----- + +// Do not merge when function is not quantized function. + +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @merge_relu_fusion + func.func private @merge_relu_fusion(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @some_func + // CHECK-SAME: -> tensor<1x3x!quant.uniform> + %2 = call @some_func(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @some_func + func.func private @some_func( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } +} + +// ----- + +// Do not merge when the quantized fusion is invalid. 
+ +module attributes {tf_saved_model.semantics} { + // CHECK-LABEL: func.func private @merge_relu_fusion + func.func private @merge_relu_fusion(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<4x3xi8>} : () -> tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform> + // CHECK: call @quantized_dot_general_relu_fn + // CHECK-SAME: -> tensor<1x3x!quant.uniform> + %2 = call @quantized_dot_general_relu_fn(%1, %0) : (tensor<1x4x!quant.uniform>, tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @quantized_dot_general_relu_fn + func.func private @quantized_dot_general_relu_fn( + %arg0: tensor<1x4x!quant.uniform>, + %arg1: tensor<4x3x!quant.uniform:f32:1, {5.000000e-03,5.000000e-03,5.000000e-03}>> + ) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} { + %0 = stablehlo.constant() {value = dense<2> : tensor<1x3xi8>} : () -> tensor<1x3x!quant.uniform> + return %0 : tensor<1x3x!quant.uniform> + } +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_nchw_convolution_to_nhwc.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_nchw_convolution_to_nhwc.mlir new file mode 100644 index 000000000000..3dfb5555ef43 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_nchw_convolution_to_nhwc.mlir @@ -0,0 +1,96 @@ +// RUN: stablehlo-quant-opt %s -tf-stablehlo-nchw-convolution-to-nhwc \ +// RUN: -split-input-file -verify-diagnostics | FileCheck %s + +// Tests that `stablehlo.transpose` ops are inserted for each of input, filter, +// and output. +// Output dimension numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f] + +// CHECK-LABEL: nchw_conv +// CHECK-SAME: %[[ARG:.+]]: tensor<1x8x4x4xf32> +func.func @nchw_conv(%arg0: tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> { + %0 = stablehlo.constant() {value = dense<7.000000e+00> : tensor<8x8x3x3xf32>} : () -> tensor<8x8x3x3xf32> + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x8x4x4xf32>, tensor<8x8x3x3xf32>) -> tensor<1x8x4x4xf32> + return %2 : tensor<1x8x4x4xf32> +} + +// CHECK-DAG: %[[CONST:.+]] = stablehlo.constant {{.*}} : tensor<8x8x3x3xf32> +// CHECK-DAG: %[[TRANSPOSE_0:.+]] = stablehlo.transpose %[[ARG]], dims = [0, 2, 3, 1] : (tensor<1x8x4x4xf32>) -> tensor<1x4x4x8xf32> +// CHECK-DAG: %[[TRANSPOSE_1:.+]] = stablehlo.transpose %[[CONST]], dims = [2, 3, 1, 0] : (tensor<8x8x3x3xf32>) -> tensor<3x3x8x8xf32> +// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[TRANSPOSE_0]], %[[TRANSPOSE_1]]) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = {{\[\[}}1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x4x4x8xf32>, tensor<3x3x8x8xf32>) -> tensor<1x4x4x8xf32> +// CHECK: %[[TRANSPOSE_2:.+]] = stablehlo.transpose %[[CONV]], dims = [0, 3, 1, 2] : (tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> + +// ----- + +// Tests that the conversion doesn't happen when the input dimension numbers +// are not [b, f, 0, 1]. 
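The three permutations checked in the positive case above follow directly from the dimension numbers: the [b, f, 0, 1] input becomes [b, 0, 1, f], the [o, i, 0, 1] filter becomes [0, 1, i, o], and the NHWC result is transposed back to NCHW. A quick NumPy sanity check with assumed array contents:

import numpy as np

x_nchw = np.zeros((1, 8, 4, 4), dtype=np.float32)   # [b, f, 0, 1]
w_oihw = np.zeros((8, 8, 3, 3), dtype=np.float32)   # [o, i, 0, 1]

x_nhwc = np.transpose(x_nchw, (0, 2, 3, 1))          # dims = [0, 2, 3, 1] -> 1x4x4x8
w_hwio = np.transpose(w_oihw, (2, 3, 1, 0))          # dims = [2, 3, 1, 0] -> 3x3x8x8

# The convolution result is NHWC (1x4x4x8); transposing with dims = [0, 3, 1, 2]
# restores the original NCHW result type 1x8x4x4.
y_nhwc = np.zeros((1, 4, 4, 8), dtype=np.float32)
y_nchw = np.transpose(y_nhwc, (0, 3, 1, 2))

assert x_nhwc.shape == (1, 4, 4, 8)
assert w_hwio.shape == (3, 3, 8, 8)
assert y_nchw.shape == (1, 8, 4, 4)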
+ +// CHECK-LABEL: conv_input_dim_numbers_mismatch +func.func @conv_input_dim_numbers_mismatch(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { + %0 = stablehlo.constant() {value = dense<7.000000e+00> : tensor<8x8x3x3xf32>} : () -> tensor<8x8x3x3xf32> + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[o, i, 0, 1]->[b, f, 0, 1], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x4x4x8xf32>, tensor<8x8x3x3xf32>) -> tensor<1x8x4x4xf32> + return %2 : tensor<1x8x4x4xf32> +} + +// CHECK-NOT: stablehlo.transpose +// CHECK: %[[CONV:.+]] = stablehlo.convolution +// CHECK-SAME{LITERAL}: [b, 0, 1, f]x[o, i, 0, 1]->[b, f, 0, 1] +// CHECK-NOT: stablehlo.transpose + +// ----- + +// Tests that the conversion doesn't happen when the feature dimension numbers +// are not [i, 0, 1, o]. + +// CHECK-LABEL: conv_feature_dim_numbers_mismatch +func.func @conv_feature_dim_numbers_mismatch(%arg0: tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> { + %0 = stablehlo.constant() {value = dense<7.000000e+00> : tensor<8x3x3x8xf32>} : () -> tensor<8x3x3x8xf32> + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[i, 0, 1, o]->[b, f, 0, 1], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x8x4x4xf32>, tensor<8x3x3x8xf32>) -> tensor<1x8x4x4xf32> + return %2 : tensor<1x8x4x4xf32> +} + +// CHECK-NOT: stablehlo.transpose +// CHECK: %[[CONV:.+]] = stablehlo.convolution +// CHECK-SAME{LITERAL}: [b, f, 0, 1]x[i, 0, 1, o]->[b, f, 0, 1] +// CHECK-NOT: stablehlo.transpose + +// ----- + +// Tests that the conversion doesn't happen when the output dimension numbers +// are not [b, 0, 1, f]. + +// CHECK-LABEL: conv_output_dim_numbers_mismatch +func.func @conv_output_dim_numbers_mismatch(%arg0: tensor<1x8x4x4xf32>) -> tensor<1x4x4x8xf32> { + %0 = stablehlo.constant() {value = dense<7.000000e+00> : tensor<8x8x3x3xf32>} : () -> tensor<8x8x3x3xf32> + %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x8x4x4xf32>, tensor<8x8x3x3xf32>) -> tensor<1x4x4x8xf32> + return %2 : tensor<1x4x4x8xf32> +} + +// CHECK-NOT: stablehlo.transpose +// CHECK: %[[CONV:.+]] = stablehlo.convolution +// CHECK-SAME{LITERAL}: [b, f, 0, 1]x[o, i, 0, 1]->[b, 0, 1, f] +// CHECK-NOT: stablehlo.transpose + +// ----- + +// Tests that a quantized convolution does not match. No conversion occurs. + +// CHECK-LABEL: quantized_convolution +func.func @quantized_convolution(%arg0: tensor<1x4x3x3x!quant.uniform>, %arg1: tensor<2x4x3x3x!quant.uniform>) -> tensor<1x2x3x3x!quant.uniform> { + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x4x3x3x!quant.uniform>, tensor<2x4x3x3x!quant.uniform>) -> tensor<1x2x3x3x!quant.uniform> + return %0 : tensor<1x2x3x3x!quant.uniform> +} + +// CHECK-NOT: stablehlo.transpose + +// ----- + +// Tests that a quantized convolution with rank > 4 does not match. +// No conversion occurs. 
+ +// CHECK-LABEL: convolution_3d +func.func @convolution_3d(%arg0: tensor<1x4x28x28x1xf32>, %arg1: tensor<2x3x3x1x16xf32>) -> tensor<1x3x26x26x16xf32> { + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, 2, f]x[0, 1, 2, i, o]->[b, 0, 1, 2, f], window = {} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x4x28x28x1xf32>, tensor<2x3x3x1x16xf32>) -> tensor<1x3x26x26x16xf32> + return %0 : tensor<1x3x26x26x16xf32> +} + +// CHECK-NOT: stablehlo.transpose diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_optimize_graph.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_optimize_graph.mlir new file mode 100644 index 000000000000..92484985334b --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_optimize_graph.mlir @@ -0,0 +1,33 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-optimize-graph | FileCheck %s + +// CHECK-LABEL: @merge_requantization_followed_by_dequantization +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x4x3xf32> +func.func @merge_requantization_followed_by_dequantization(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> { + // CHECK: %[[CST:.*]] = stablehlo.constant dense<4.000000e-01> : tensor<2x3x3x2xf32> + // CHECK: %[[QUANT_CST:.*]] = stablehlo.uniform_quantize %[[CST]] + // CHECK: %[[QUANT_ARG_0:.*]] = stablehlo.uniform_quantize %[[ARG_0]] + // CHECK: %[[CONV:.*]] = stablehlo.convolution(%[[QUANT_ARG_0]], %[[QUANT_CST]]) + // CHECK-NOT: stablehlo.uniform_quantize + // CHECK: %[[DEQUANT:.*]] = stablehlo.uniform_dequantize %[[CONV]] + // CHECK: return %[[DEQUANT]] + %cst = stablehlo.constant dense<0.4> : tensor<2x3x3x2xf32> + %quant_cst = stablehlo.uniform_quantize %cst : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32, 0.015>> + %quant_arg = stablehlo.uniform_quantize %arg0 : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> + %conv = stablehlo.convolution(%quant_arg, %quant_cst) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, 0.015>>) -> tensor<1x3x4x2x!quant.uniform> + %requant = stablehlo.uniform_quantize %conv : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> + %dequant = stablehlo.uniform_dequantize %requant : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2xf32> + func.return %dequant : tensor<1x3x4x2xf32> +} + +// ----- + +// CHECK-LABEL: @dont_merge_quantization_followed_by_quantization +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x4x3xf32> +func.func @dont_merge_quantization_followed_by_quantization(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> { + // CHECK: %[[QUANT_ARG_0:.*]] = stablehlo.uniform_quantize %[[ARG_0]] + // CHECK: %[[DEQUANT:.*]] = stablehlo.uniform_dequantize %[[QUANT_ARG_0]] + // CHECK: return %[[DEQUANT]] + %quant_arg = stablehlo.uniform_quantize %arg0 : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> + %dequant = stablehlo.uniform_dequantize %quant_arg : (tensor<1x3x4x3x!quant.uniform>) -> tensor<1x3x4x3xf32> + func.return %dequant : tensor<1x3x4x3xf32> +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_post_quantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_post_quantize.mlir new file mode 100644 index 000000000000..01f2ee34f0c8 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_post_quantize.mlir @@ -0,0 +1,72 @@ +// RUN: 
stablehlo-quant-opt %s -split-input-file -tf-stablehlo-post-quantize | FileCheck %s + +// CHECK-LABEL: @remove_volatile_qdq +func.func @remove_volatile_qdq() -> tensor<3x2xf32> { + // CHECK: %[[CST:.*]] = stablehlo.constant + // CHECK-NOT: "quantization.qcast" + // CHECK-NOT: "quantization.dcast" + // CHECK: return %[[CST]] + %cst = stablehlo.constant dense<[[-0.960978984, -0.390246302], [-0.790828585, -0.601039409], [-1.0280807, -1.02731466]]> : tensor<3x2xf32> + %q = "quantization.qcast"(%cst) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform> + %dq = "quantization.dcast"(%q) : (tensor<3x2x!quant.uniform>) -> tensor<3x2xf32> + func.return %dq : tensor<3x2xf32> +} + +// ----- + +// CHECK-LABEL: @remove_volatile_qdq_with_requantization +// CHECK-SAME: %[[ARG0:.*]]: tensor<3x2xf32> +func.func @remove_volatile_qdq_with_requantization(%arg0: tensor<3x2xf32>) -> tensor<3x2xf32> { + // CHECK: %[[Q1:.*]] = stablehlo.uniform_quantize %[[ARG0]] + // CHECK: %[[Q2:.*]] = stablehlo.uniform_quantize %[[Q1]] + // CHECK: %[[ABS:.*]] = stablehlo.abs %[[Q2]] + // CHECK: %[[DQ:.*]] = stablehlo.uniform_dequantize %[[ABS]] + // CHECK: %[[ADD:.*]] = stablehlo.add %[[ARG0]], %[[DQ]] + // CHECK: return %[[ADD]] + %q1 = "quantization.qcast"(%arg0) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform> + %q2 = "quantization.qcast"(%q1) {volatile} : (tensor<3x2x!quant.uniform>) -> tensor<3x2x!quant.uniform> + %dq1 = "quantization.dcast"(%q2) : (tensor<3x2x!quant.uniform>) -> tensor<3x2xf32> + %abs = stablehlo.abs %q2 : (tensor<3x2x!quant.uniform>) -> tensor<3x2x!quant.uniform> + %dq2 = "quantization.dcast"(%abs) : (tensor<3x2x!quant.uniform>) -> tensor<3x2xf32> + %add = stablehlo.add %dq1, %dq2 : (tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf32> + func.return %add : tensor<3x2xf32> +} + +// ----- + +// CHECK-LABEL: @quantize_constant +// CHECK-SAME: %[[ARG0:.*]]: tensor<1x3xf32> +func.func @quantize_constant(%arg0: tensor<1x3xf32>) -> tensor<1x2xf32> { + // CHECK-DAG: %[[QCST:.*]] = stablehlo.constant() <{value = dense<-78> : tensor<3x2xi8>}> : () -> tensor<3x2x!quant.uniform:f32, 5.000000e-03>> + // CHECK-DAG: %[[Q1:.*]] = stablehlo.uniform_quantize %[[ARG0]] + // CHECK-NOT: "quantization.qcast" + // CHECK: %[[DOT:.*]] = stablehlo.dot %[[Q1]], %[[QCST]] + // CHECK: %[[DQ:.*]] = stablehlo.uniform_dequantize %[[DOT]] + // CHECK: return %[[DQ]] + %cst = stablehlo.constant dense<-0.390246302> : tensor<3x2xf32> + %q1 = "quantization.qcast"(%arg0) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %q2 = "quantization.qcast"(%cst) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform:f32, 5.000000e-03>> + %dot = stablehlo.dot %q1, %q2 : (tensor<1x3x!quant.uniform>, tensor<3x2x!quant.uniform:f32, 5.000000e-03>>) -> tensor<1x2x!quant.uniform> + %dq = "quantization.dcast"(%dot) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> + func.return %dq : tensor<1x2xf32> +} + +// ----- + +// CHECK-LABEL: @convert_quantization_qdq_to_stablehlo_uniform_qdq +// CHECK-SAME: %[[ARG0:.*]]: tensor<1x3xf32> +// CHECK-SAME: %[[ARG1:.*]]: tensor<3x2xf32> +func.func @convert_quantization_qdq_to_stablehlo_uniform_qdq(%arg0: tensor<1x3xf32>, %arg1: tensor<3x2xf32>) -> tensor<1x2xf32> { + // CHECK: %[[Q1:.*]] = stablehlo.uniform_quantize %[[ARG0]] + // CHECK-NOT: "quantization.qcast" + // CHECK: %[[Q2:.*]] = stablehlo.uniform_quantize %[[ARG1]] + // CHECK-NOT: "quantization.qcast" + // CHECK: %[[DOT:.*]] = stablehlo.dot %[[Q1]], %[[Q2]] + // CHECK: %[[DQ:.*]] = stablehlo.uniform_dequantize %[[DOT]] + // 
CHECK: return %[[DQ]] + %q1 = "quantization.qcast"(%arg0) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %q2 = "quantization.qcast"(%arg1) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform:f32, 5.000000e-03>> + %dot = stablehlo.dot %q1, %q2 : (tensor<1x3x!quant.uniform>, tensor<3x2x!quant.uniform:f32, 5.000000e-03>>) -> tensor<1x2x!quant.uniform> + %dq = "quantization.dcast"(%dot) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> + func.return %dq : tensor<1x2xf32> +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_quantize_composite_functions.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_quantize_composite_functions.mlir new file mode 100644 index 000000000000..46e51a7dd0f7 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_quantize_composite_functions.mlir @@ -0,0 +1,896 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics \ +// RUN: -tf-stablehlo-quantize-composite-functions | FileCheck %s +// RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics \ +// RUN: -tf-stablehlo-quantize-composite-functions=enable-per-channel-quantized-weight=false | FileCheck --check-prefix=CHECK-PER-TENSOR %s + +// Tests that basic dot_general is properly quantized. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_dot_general_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } +// Checks that the quantized XlaCallModule has been replaced by a CallOp, which +// calls the quantized entry function. 
+ +// CHECK: func.func private @quantize_dot_general_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} +// CHECK: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> + +// CHECK-PER-TENSOR: func.func private @quantize_dot_general_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> + + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +// Checks that the entry function is quantized for dot_general. Quantized +// dot_general outputs an i32 quantized tensor, followed by requantization to +// i8 quantized tensor. 
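Concretely, the quantized entry function performs an integer dot_general that accumulates in i32 and then a uniform_quantize that rescales the accumulator to the output's i8 parameters. A minimal sketch of that requantization step, assuming made-up scales and zero points (the real ones are derived from the calibration statistics):

import numpy as np

IN_SCALE, IN_ZP = 0.0035, -128      # assumed input quantization
W_SCALE = 0.0024                    # assumed weight scale
OUT_SCALE, OUT_ZP = 0.0027, -128    # assumed output quantization

xq = np.random.randint(-128, 128, (1, 2), dtype=np.int8)
wq = np.random.randint(-127, 128, (2, 3), dtype=np.int8)

# i8 x i8 dot_general with i32 accumulation; accumulator scale = IN_SCALE * W_SCALE.
acc_i32 = (xq.astype(np.int32) - IN_ZP) @ wq.astype(np.int32)

# uniform_quantize: rescale the i32 accumulator to the i8 output type.
out_i8 = np.clip(np.round(acc_i32 * (IN_SCALE * W_SCALE) / OUT_SCALE) + OUT_ZP,
                 -128, 127).astype(np.int8)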
+ +// CHECK: func.func private @quantized_dot_general_fn(%[[ARG_1:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x!quant.uniform:f32:1, {{.*}}>>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} +// CHECK: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]], contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[DOT_GENERAL_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x!quant.uniform> + +// CHECK-PER-TENSOR: func.func private @quantized_dot_general_fn(%[[ARG_1:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]], contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[DOT_GENERAL_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x!quant.uniform> +} + +// ----- + +// Tests that `stablehlo.dot_general` with `batching_dim` is quantized. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_dot_general_batch_per_tensor_quantized_fn(%arg0: tensor<2x2x2xf32>) -> tensor<2x2x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x2x3xf32>} : () -> tensor<2x2x3xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<2x2x2xf32>) -> tensor<2x2x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<2x2x2xf32>, tensor<2x2x3xf32>) -> tensor<2x2x3xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<2x2x3xf32>) -> tensor<2x2x3xf32> + return %2 : tensor<2x2x3xf32> + } +// CHECK: func.func private @quantize_dot_general_batch_per_tensor_quantized_fn(%[[ARG_0:.+]]: tensor<2x2x2xf32>) -> tensor<2x2x3xf32> attributes {tf._original_func_name = "main_0"} +// CHECK: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<127> : tensor<2x2x3xi8>}> : () -> tensor<2x2x3x!quant.uniform:f32, {{.*}}>> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<2x2x2xf32>) -> tensor<2x2x2x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<2x2x2x!quant.uniform>, tensor<2x2x3x!quant.uniform:f32, {{.*}}>) -> tensor<2x2x3x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<2x2x3x!quant.uniform) -> tensor<2x2x3xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<2x2x3xf32> + + func.func private 
@composite_dot_general_fn(%arg0: tensor<2x2x2xf32>, %arg1: tensor<2x2x3xf32>) -> tensor<2x2x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x2x2xf32>, tensor<2x2x3xf32>) -> tensor<2x2x3xf32> + return %0 : tensor<2x2x3xf32> + } +} + +// ----- + +// Tests that fused pattern for dot_general + bias is properly quantized. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_dot_general_with_bias_same_shape_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x3xf32>} : () -> tensor<1x3xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_bias_same_shape_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_with_bias_same_shape_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } +// CHECK: func.func private @quantize_dot_general_with_bias_same_shape_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} +// CHECK: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}> +// CHECK: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x3xi32>}> : () -> tensor<1x3x!quant.uniform +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_same_shape_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>, tensor<1x3x!quant.uniform) -> tensor<1x3x!quant.uniform +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> + +// CHECK-PER-TENSOR: func.func private @quantize_dot_general_with_bias_same_shape_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x3xi32>}> : () -> tensor<1x3x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_same_shape_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], 
%[[CONST_1]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>, tensor<1x3x!quant.uniform) -> tensor<1x3x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> + + func.func private @composite_dot_general_with_bias_same_shape_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +// CHECK: func.func private @quantized_dot_general_with_bias_same_shape_fn(%[[ARG_1:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x!quant.uniform:f32:1, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} +// CHECK: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]], contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[DOT_GENERAL_0]], %[[ARG_3]] : tensor<1x3x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x!quant.uniform> + +// CHECK-PER-TENSOR: func.func private @quantized_dot_general_with_bias_same_shape_fn(%[[ARG_1:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]], contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[DOT_GENERAL_0]], %[[ARG_3]] : tensor<1x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x!quant.uniform> + +} + +// ----- + +// Tests that fused pattern for dot_general + bias with dynamic batch dimension +// is properly quantized. 
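As in the bias tests here, the float bias constant is folded into an i32 quantized constant whose scale matches the dot_general accumulator (input scale times weight scale, zero point 0), so the add happens directly on the i32 result before the final requantization. A short sketch under those assumptions (scales are illustrative, not taken from the pass):

import numpy as np

IN_SCALE = 0.0035      # assumed input scale
W_SCALE = 0.0024       # assumed weight scale (per-tensor for brevity)
bias_f = np.full((3,), 0.4, dtype=np.float32)

# Bias quantized to i32 at the accumulator scale, zero point 0.
bias_scale = IN_SCALE * W_SCALE
bias_i32 = np.round(bias_f / bias_scale).astype(np.int32)

acc_i32 = np.zeros((1, 3), dtype=np.int32)   # stand-in for the dot_general result
biased = acc_i32 + bias_i32                  # i32 add, still at scale bias_scale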
+ +module attributes {tf_saved_model.semantics} { + func.func private @quantize_dot_general_with_bias_dynamic_fn(%arg0: tensor) -> tensor attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<3xf32>} : () -> tensor<3xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape], _entry_function = @composite_dot_general_with_bias_dynamic_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_with_bias_dynamic_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor, tensor<2x3xf32>, tensor<3xf32>) -> tensor + %2 = "quantization.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + return %2 : tensor + } +// CHECK: func.func private @quantize_dot_general_with_bias_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}> +// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<3xi32>}> : () -> tensor<3x!quant.uniform +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) {_quantization_method = "static_range_ptq { }"} : (tensor>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>, tensor<3x!quant.uniform) -> tensor +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + +// CHECK-PER-TENSOR: func.func private @quantize_dot_general_with_bias_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<3xi32>}> : () -> tensor<3x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) {_quantization_method = "static_range_ptq { }"} : (tensor>, tensor<2x3x!quant.uniform:f32, {{.*}}>, tensor<3x!quant.uniform) -> tensor +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + + func.func private @composite_dot_general_with_bias_dynamic_fn(%arg0: tensor, %arg1: tensor<2x3xf32>, %arg2: tensor<3xf32>) -> tensor attributes {_from_xla_call_module} { + %cst_0 = stablehlo.constant dense<2> : tensor<1xi32> + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor, tensor<2x3xf32>) -> tensor + %1 = stablehlo.get_dimension_size %0, dim = 0 : 
(tensor) -> tensor + %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> + %3 = stablehlo.concatenate %2, %cst_0, dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %4 = stablehlo.dynamic_broadcast_in_dim %arg2, %3, dims = [1] : (tensor<3xf32>, tensor<2xi32>) -> tensor + %5 = stablehlo.add %0, %4 : tensor + return %5 : tensor + } +} +// CHECK: func.func private @quantized_dot_general_with_bias_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x!quant.uniform:f32:1, {{.*}}>>, %[[ARG_3:.+]]: tensor<3x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK: %[[CONST_2:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]], contracting_dims = [1] x [0] : (tensor>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>>) -> tensor> +// CHECK: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[DOT_GENERAL_0]], dim = 0 : (tensor) +// CHECK: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [1] : (tensor<3x!quant.uniform>, tensor<2xi32>) -> tensor> +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[DOT_GENERAL_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) +// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor> + +// CHECK-PER-TENSOR: func.func private @quantized_dot_general_with_bias_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<3x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR: %[[CONST_2:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK-PER-TENSOR: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]], contracting_dims = [1] x [0] : (tensor>, tensor<2x3x!quant.uniform:f32, {{.*}}>>) -> tensor> +// CHECK-PER-TENSOR: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[DOT_GENERAL_0]], dim = 0 : (tensor) +// CHECK-PER-TENSOR: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> +// CHECK-PER-TENSOR: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [1] : (tensor<3x!quant.uniform>, tensor<2xi32>) -> tensor> +// CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[DOT_GENERAL_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_1]] : tensor> + +// ----- + +// Tests that basic convolution is properly quantized. It is per-channel +// quantized unless `enable-per-channel-quantized-weight=false`, according to +// `_quantization_method` with an `input_quantized_types` and explicit +// `dimension_specs`. 
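The difference between the two modes is only in how the weight scales are computed: `dimension_specs {dimension: 3}` asks for one scale per slice along the filter's output-feature axis (dimension 3), while the per-tensor form uses a single scale for the whole filter. A small symmetric-i8 sketch with assumed weights:

import numpy as np

w = np.random.uniform(-0.3, 0.3, (2, 3, 3, 2)).astype(np.float32)   # [0, 1, i, o]

# Per-tensor: one symmetric scale for the entire filter.
per_tensor_scale = np.abs(w).max() / 127.0

# Per-channel along dimension 3: one scale per output feature.
per_channel_scales = np.abs(w).max(axis=(0, 1, 2)) / 127.0           # shape (2,)

wq_per_tensor = np.clip(np.round(w / per_tensor_scale), -127, 127).astype(np.int8)
wq_per_channel = np.clip(np.round(w / per_channel_scales), -127, 127).astype(np.int8)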
+ +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst) { + Sout = [#tf_type.shape<1x3x4x2>], + dim_args_spec = [], + disabled_checks = [], + has_token_input_output = false, + module = "", + platforms = [], + version = 5 : i64, + _entry_function = @composite_conv_fn, + _stableghlo_version = "1.0.0", + _original_entry_function = "composite_conv_fn", + // Per-channel quantization at dimension 3 for input index 1. + _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}", + _stablehlo_module_attrs = {}, + _tfl_quant_trait = "fully_quantizable", + device = "" + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + return %2 : tensor<1x3x4x2xf32> + } +// Check that the quantized XlaCallModule has been replaced by a CallOp, which +// calls the quantized entry function. + +// CHECK: func.func private @quantize_conv_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) +// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + +// CHECK-PER-TENSOR: func.func private @quantize_conv_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) +// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + + func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: 
tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %0 : tensor<1x3x4x2xf32> + } +// Checks that the entry function is quantized for convolution. Quantized +// convolution outputs an i32 quantized tensor, followed by requantization to +// i8 quantized tensor. + +// CHECK: func.func private @quantized_conv_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} +// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[CONVOLUTION_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> + +// CHECK-PER-TENSOR: func.func private @quantized_conv_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[CONVOLUTION_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> +} + +// ----- + +// Tests that basic convolution is properly quantized. In this example, the +// convolution is always per-tensor quantized (even if +// enable-per-channel-quantized-weights=true), according to +// `_quantization_method`. + +// CHECK-LABEL: quantize_conv_fn_per_tensor +func.func @quantize_conv_fn_per_tensor(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst) { + Sout = [#tf_type.shape<1x3x4x2>], + dim_args_spec = [], + disabled_checks = [], + has_token_input_output = false, + module = "", + platforms = [], + version = 5 : i64, + _entry_function = @composite_conv_fn, + _stablehlo_version = "1.0.0", + _original_entry_function = "composite_conv_fn", + _quantization_method = "static_range_ptq { }", + _stablehlo_module_attrs = {}, + _tfl_quant_trait = "fully_quantizable", + device = "" + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + return %2 : tensor<1x3x4x2xf32> +} +// Check that the quantized XlaCallModule has been replaced by a CallOp, which +// calls the quantized entry function. 
+ +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + +func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %0 : tensor<1x3x4x2xf32> +} +// Checks that the entry function is quantized for convolution. Quantized +// convolution outputs an i32 quantized tensor, followed by requantization to +// i8 quantized tensor. + +// CHECK: func.func private @quantized_conv_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} +// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[CONVOLUTION_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> + +// ----- + +// Tests that fused pattern for convolution + bias is properly quantized. + +// Checks that fused functions with 1D bias is properly quantized. +// The 1D bias should be broadcasted in dims [3], where it initially has +// `quantizedDimension=0`, but has `quantizedDimension=3` after broadcasting. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_with_bias_1d_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) { + Sout = [#tf_type.shape<1x3x4x2>], + _entry_function = @composite_conv_with_bias_1d_fn, + _stablehlo_version = "1.0.0", + _original_entry_function = "composite_conv_with_bias_1d_fn", + _stablehlo_module_attrs = {}, + // Per-channel quantization at dimension 3 for input index 1. 
+ _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}", + _tfl_quant_trait = "fully_quantizable", + device = "", + dim_args_spec = [], + disabled_checks = [], + has_token_input_output = false, + module = "", + platforms = [], + version = 5 : i64 + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + return %2 : tensor<1x3x4x2xf32> + } +// CHECK: func.func private @quantize_conv_with_bias_1d_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> +// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<47978> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_1d_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + +// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_1d_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_1d_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + + func.func private @composite_conv_with_bias_1d_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.broadcast_in_dim %arg2, dims = [3] : (tensor<2xf32>) -> tensor<1x3x4x2xf32> + %1 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : 
(tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
+ %2 = stablehlo.add %1, %0 : tensor<1x3x4x2xf32>
+ return %2 : tensor<1x3x4x2xf32>
+ }
+// CHECK: func.func private @quantized_conv_with_bias_1d_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>, %[[ARG_3:.+]]: tensor<2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module}
+// CHECK: %[[BROADCAST_IN_DIM:.+]] = stablehlo.broadcast_in_dim %arg2, dims = [3] : (tensor<2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform>
+// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform>
+// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[BROADCAST_IN_DIM]] : tensor<1x3x4x2x!quant.uniform>
+// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform>
+// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform>
+
+// CHECK-PER-TENSOR: func.func private @quantized_conv_with_bias_1d_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module}
+// CHECK-PER-TENSOR: %[[BROADCAST_IN_DIM:.+]] = stablehlo.broadcast_in_dim %[[ARG_3]]
+// CHECK-PER-TENSOR: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform>
+// CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[BROADCAST_IN_DIM]] : tensor<1x3x4x2x!quant.uniform>
+// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform>
+// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform>
+}
+
+// -----
+
+// Checks that a fused function with a 4D bias is properly quantized.
+// The 4D bias should be broadcasted in dims [0, 1, 2, 3], where it
+// already has `quantizedDimension=3`.
+
+module attributes {tf_saved_model.semantics} {
+ func.func private @quantize_conv_with_bias_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} {
+ %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
+ %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32>
+ %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32>
+ %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {
+ Sout = [#tf_type.shape<1x3x4x2>],
+ _entry_function = @composite_conv_with_bias_fn,
+ _stablehlo_version = "1.0.0",
+ _original_entry_function = "composite_conv_with_bias_fn",
+ _stablehlo_module_attrs = {},
+ // Per-channel quantization at dimension 3 for input index 1.
+ _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}", + _tfl_quant_trait = "fully_quantizable", + device = "", + dim_args_spec = [], + disabled_checks = [], + has_token_input_output = false, + module = "", + platforms = [], + version = 5 : i64 + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x4x2xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + return %2 : tensor<1x3x4x2xf32> + } +// CHECK: func.func private @quantize_conv_with_bias_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> +// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + +// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + + func.func private @composite_conv_with_bias_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<1x1x1x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.broadcast_in_dim %arg2, dims = [0, 1, 2, 3] : (tensor<1x1x1x2xf32>) -> tensor<1x3x4x2xf32> + %1 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : 
(tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + %2 = stablehlo.add %1, %0 : tensor<1x3x4x2xf32> + return %2 : tensor<1x3x4x2xf32> + } +// CHECK: func.func private @quantized_conv_with_bias_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} +// CHECK: %[[BROADCAST_IN_DIM:.+]] = stablehlo.broadcast_in_dim %arg2, dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[BROADCAST_IN_DIM]] : tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> + +// CHECK-PER-TENSOR: func.func private @quantized_conv_with_bias_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR: %[[BROADCAST_IN_DIM:.+]] = stablehlo.broadcast_in_dim %arg2 +// CHECK-PER-TENSOR: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[BROADCAST_IN_DIM]] : tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> +} + +// ----- + +// Tests that fused pattern for convolution + bias with dynamic batch dimension +// is properly quantized. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_with_bias_dynamic_fn(%arg0: tensor) -> tensor attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) { + Sout = [#tf_type.shape<1x3x4x2>], + _entry_function = @composite_conv_with_bias_dynamic_fn, + _stablehlo_version = "1.0.0", + _original_entry_function = "composite_conv_with_bias_dynamic_fn", + _stablehlo_module_attrs = {}, + // Per-channel quantization at dimension 3 for input index 1. 
+ _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}", + _tfl_quant_trait = "fully_quantizable", + device = "", + dim_args_spec = [], + disabled_checks = [], + has_token_input_output = false, + module = "", + platforms = [], + version = 5 : i64 + } : (tensor, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor + %2 = "quantization.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + return %2 : tensor + } +// CHECK: func.func private @quantize_conv_with_bias_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> +// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + +// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-PER_TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + + func.func private @composite_conv_with_bias_dynamic_fn(%arg0: tensor, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<1x1x1x2xf32>) -> tensor attributes {_from_xla_call_module} { + %cst_0 = stablehlo.constant dense<3> : tensor<1xi32> + %cst_1 = stablehlo.constant dense<4> : tensor<1xi32> + %cst_2 = stablehlo.constant dense<2> : tensor<1xi32> + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor, tensor<2x3x3x2xf32>) -> tensor + %1 = stablehlo.get_dimension_size %0, dim = 0 : (tensor) -> tensor + %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> + %3 = stablehlo.concatenate %2, %cst_0, %cst_1, %cst_2, dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> 
tensor<4xi32> + %4 = stablehlo.dynamic_broadcast_in_dim %arg2, %3, dims = [0, 1, 2, 3] : (tensor<1x1x1x2xf32>, tensor<4xi32>) -> tensor + %5 = stablehlo.add %0, %4 : tensor + return %5 : tensor + } +} +// CHECK: func.func private @quantized_conv_with_bias_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-DAG: %[[CONST_2:.+]] = stablehlo.constant dense<3> : tensor<1xi32> +// CHECK-DAG: %[[CONST_3:.+]] = stablehlo.constant dense<4> : tensor<1xi32> +// CHECK-DAG: %[[CONST_4:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>) -> tensor> +// CHECK: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[CONVOLUTION_0]], dim = 0 : (tensor) +// CHECK: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>, tensor<4xi32>) -> tensor> +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) +// CHECK: return %[[UNIFORM_QUANTIZE_0]] : tensor> + +// CHECK-PER-TENSOR: func.func private @quantized_conv_with_bias_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR-DAG: %[[CONST_2:.+]] = stablehlo.constant dense<3> : tensor<1xi32> +// CHECK-PER-TENSOR-DAG: %[[CONST_3:.+]] = stablehlo.constant dense<4> : tensor<1xi32> +// CHECK-PER-TENSOR-DAG: %[[CONST_4:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor> +// CHECK-PER-TENSOR: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[CONVOLUTION_0]], dim = 0 : (tensor) +// CHECK-PER-TENSOR: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK-PER-TENSOR: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>, tensor<4xi32>) -> tensor> +// CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_0]] : tensor> + +// ----- + +// Tests that fused pattern for convolution + bias + relu with +// dynamic batch dimension is properly quantized. 
+ +// Note that this checks for identical condition as +// quantize_conv_with_bias_dynamic_fn, omitting stablehlo.maximum. +// This is because activation clipping which includes 0.0f can be simply +// omitted from the graph as the lifted function's out_scale and out_zp are +// already calculated based on the clipped distribution. +// Note that the resulting scale and zero point should be calculated based on +// clipped range [0, r_max]. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_with_bias_and_relu_dynamic_fn(%arg0: tensor) -> tensor attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) { + Sout = [#tf_type.shape<1x3x4x2>], + _entry_function = @composite_conv_with_bias_and_relu_dynamic_fn, + _stablehlo_version = "1.0.0", + _original_entry_function = "composite_conv_with_bias_and_relu_dynamic_fn", + _stablehlo_module_attrs = {}, + // Per-channel quantization at dimension 3 for input index 1. + _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}", + _tfl_quant_trait = "fully_quantizable", + device = "", + dim_args_spec = [], + disabled_checks = [], + has_token_input_output = false, + module = "", + platforms = [], + version = 5 : i64 + } : (tensor, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor + %2 = "quantization.stats"(%1) {layerStats = dense<[0.00000000e-6, 8.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + return %2 : tensor + } +// CHECK: func.func private @quantize_conv_with_bias_and_relu_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> +// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + +// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_and_relu_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// 
CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + + func.func private @composite_conv_with_bias_and_relu_dynamic_fn(%arg0: tensor, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<1x1x1x2xf32>) -> tensor attributes {_from_xla_call_module} { + %cst_0 = stablehlo.constant dense<3> : tensor<1xi32> + %cst_1 = stablehlo.constant dense<4> : tensor<1xi32> + %cst_2 = stablehlo.constant dense<2> : tensor<1xi32> + %cst_3 = stablehlo.constant dense<0.000000e+00> : tensor + %cst_4 = stablehlo.constant dense<6.000000e+00> : tensor + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor, tensor<2x3x3x2xf32>) -> tensor + %1 = stablehlo.get_dimension_size %0, dim = 0 : (tensor) -> tensor + %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> + %3 = stablehlo.concatenate %2, %cst_0, %cst_1, %cst_2, dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + %4 = stablehlo.dynamic_broadcast_in_dim %arg2, %3, dims = [0, 1, 2, 3] : (tensor<1x1x1x2xf32>, tensor<4xi32>) -> tensor + %5 = stablehlo.add %0, %4 : tensor + %6 = stablehlo.clamp %cst_3, %5, %cst_4 : (tensor, tensor, tensor) -> tensor + return %6 : tensor + } +} +// CHECK: func.func private @quantized_conv_with_bias_and_relu_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-DAG: %[[CONST_2:.+]] = stablehlo.constant dense<3> : tensor<1xi32> +// CHECK-DAG: %[[CONST_3:.+]] = stablehlo.constant dense<4> : tensor<1xi32> +// CHECK-DAG: %[[CONST_4:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor> +// CHECK: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[CONVOLUTION_0]], dim = 0 : (tensor) +// CHECK: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>, tensor<4xi32>) -> tensor> +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) -> tensor> +// CHECK: return %[[UNIFORM_QUANTIZE_0]] : tensor> + +// CHECK-PER-TENSOR: func.func private @quantized_conv_with_bias_and_relu_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: 
tensor<1x1x1x2x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR-DAG: %[[CONST_2:.+]] = stablehlo.constant dense<3> : tensor<1xi32> +// CHECK-PER-TENSOR-DAG: %[[CONST_3:.+]] = stablehlo.constant dense<4> : tensor<1xi32> +// CHECK-PER-TENSOR-DAG: %[[CONST_4:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor> +// CHECK-PER-TENSOR: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[CONVOLUTION_0]], dim = 0 : (tensor) +// CHECK-PER-TENSOR: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK-PER-TENSOR: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>, tensor<4xi32>) -> tensor> +// CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) -> tensor> +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_0]] : tensor> + +// ----- + +// Tests that fused pattern for convolution + bias + relu6 with +// dynamic batch dimension is properly quantized. + +// Note that this checks for identical condition as +// quantize_conv_with_bias_dynamic_fn, omitting stablehlo.clamp. +// This is because activation clipping which includes 0.0f can be simply +// omitted from the graph as the lifted function's out_scale and out_zp are +// already calculated based on the clipped distribution. +// Note that the resulting scale and zero point should be calculated based on +// clipped range [0, r_max]. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_with_bias_and_relu6_dynamic_fn(%arg0: tensor) -> tensor attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %0 = "quantization.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) { + Sout = [#tf_type.shape<1x3x4x2>], + _entry_function = @composite_conv_with_bias_and_relu6_dynamic_fn, + _stablehlo_version = "1.0.0", + _original_entry_function = "composite_conv_with_bias_and_relu6_dynamic_fn", + _stablehlo_module_attrs = {}, + // Per-channel quantization at dimension 3 for input index 1. 
+ _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}", + _tfl_quant_trait = "fully_quantizable", + device = "", + dim_args_spec = [], + disabled_checks = [], + has_token_input_output = false, + module = "", + platforms = [], + version = 5 : i64 + } : (tensor, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor + %2 = "quantization.stats"(%1) {layerStats = dense<[5.00000000e-6, 6.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + return %2 : tensor + } +// CHECK: func.func private @quantize_conv_with_bias_and_relu6_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> +// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + +// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_and_relu6_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) +// CHECK-PER-TENSOR: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + + func.func private @composite_conv_with_bias_and_relu6_dynamic_fn(%arg0: tensor, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<1x1x1x2xf32>) -> tensor attributes {_from_xla_call_module} { + %cst_0 = stablehlo.constant dense<3> : tensor<1xi32> + %cst_1 = stablehlo.constant dense<4> : tensor<1xi32> + %cst_2 = stablehlo.constant dense<2> : tensor<1xi32> + %cst_3 = stablehlo.constant dense<0.000000e+00> : tensor + %cst_4 = stablehlo.constant dense<6.000000e+00> : tensor + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor, tensor<2x3x3x2xf32>) -> tensor + %1 = stablehlo.get_dimension_size %0, dim 
= 0 : (tensor) -> tensor + %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> + %3 = stablehlo.concatenate %2, %cst_0, %cst_1, %cst_2, dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + %4 = stablehlo.dynamic_broadcast_in_dim %arg2, %3, dims = [0, 1, 2, 3] : (tensor<1x1x1x2xf32>, tensor<4xi32>) -> tensor + %5 = stablehlo.add %0, %4 : tensor + %6 = stablehlo.clamp %cst_3, %5, %cst_4 : (tensor, tensor, tensor) -> tensor + return %6 : tensor + } +} +// CHECK: func.func private @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-DAG: %[[CONST_2:.+]] = stablehlo.constant dense<3> : tensor<1xi32> +// CHECK-DAG: %[[CONST_3:.+]] = stablehlo.constant dense<4> : tensor<1xi32> +// CHECK-DAG: %[[CONST_4:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor> +// CHECK: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[CONVOLUTION_0]], dim = 0 : (tensor) +// CHECK: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>, tensor<4xi32>) -> tensor> +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) -> tensor> +// CHECK: return %[[UNIFORM_QUANTIZE_0]] : tensor> + +// CHECK-PER-TENSOR: func.func private @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR-DAG: %[[CONST_2:.+]] = stablehlo.constant dense<3> : tensor<1xi32> +// CHECK-PER-TENSOR-DAG: %[[CONST_3:.+]] = stablehlo.constant dense<4> : tensor<1xi32> +// CHECK-PER-TENSOR-DAG: %[[CONST_4:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor> +// CHECK-PER-TENSOR: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[CONVOLUTION_0]], dim = 0 : (tensor) +// CHECK-PER-TENSOR: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK-PER-TENSOR: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>, tensor<4xi32>) -> tensor> +// CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK-PER-TENSOR: 
%[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) -> tensor>
+// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_0]] : tensor>
+
+// -----
+
+// Tests that an XlaCallModule op without surrounding quantization.stats ops is
+// not quantized and is converted to a plain func.call.
+
+module attributes {tf_saved_model.semantics} {
+ func.func private @not_quantized_without_stats_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} {
+ %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32>
+ %1 = "tf.XlaCallModule"(%arg0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+ return %1 : tensor<1x3xf32>
+ }
+// Check that "tf.Const" is converted to stablehlo.constant. XlaCallModule is
+// not quantized.
+
+// CHECK: func.func private @not_quantized_without_stats_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"}
+// CHECK: %[[CONST_0:.+]] = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32>
+// CHECK: %[[CALL:.+]] = call @composite_dot_general_fn(%[[ARG_0]], %[[CONST_0]]) : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+// CHECK: return %[[CALL]]
+
+ func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} {
+ %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+ return %0 : tensor<1x3xf32>
+ }
+// CHECK: func.func private @composite_dot_general_fn(%[[ARG_1:.+]]: tensor<1x2xf32>, %[[ARG_2:.+]]: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module}
+// Check that the composite_dot_general_fn is untouched.
+// CHECK: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]]
+// CHECK: return %[[DOT_GENERAL_0]]
+}
+
+// -----
+
+// Tests that basic `stablehlo.gather` is properly quantized.
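+// As a rough sketch of where the activation parameters come from (this assumes
+// the usual asymmetric i8 calibration scheme and is not spelled out by the
+// test itself): for recorded layerStats = [r_min, r_max],
+//   scale      ≈ (r_max - r_min) / (q_max - q_min) = (r_max - r_min) / 255
+//   zero_point ≈ round(q_min - r_min / scale)
+// The gather below then operates directly on the quantized operand, and only
+// the trailing stablehlo.uniform_quantize rescales to the output type.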
+ +module attributes {tf_saved_model.semantics} { +// CHECK: func.func private @quantize_gather_fn(%[[ARG:.+]]: tensor<3x4x2xf32>) -> tensor<2x3x2x2xf32> attributes {tf._original_func_name = "main_0"} + func.func private @quantize_gather_fn(%arg: tensor<3x4x2xf32>) -> tensor<2x3x2x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<1> : tensor<2x3x2xi32>} : () -> tensor<2x3x2xi32> + %0 = "quantization.stats"(%arg) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<3x4x2xf32>) -> tensor<3x4x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<2x3x2x2>], _entry_function = @composite_gather_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_gather_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x4x2xf32>, tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<2x3x2x2xf32>) -> tensor<2x3x2x2xf32> + return %2 : tensor<2x3x2x2xf32> + } +// Checks that the quantized XlaCallModule has been replaced by a CallOp, which +// calls the quantized entry function. +// CHECK: %[[CONST:.+]] = stablehlo.constant dense<{{.*}}> : tensor<2x3x2xi32> +// CHECK: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<3x4x2xf32>) -> tensor<3x4x2x!quant.uniform> +// CHECK: %[[CALL:.+]] = call @quantized_gather_fn(%[[UNIFORM_QUANTIZE]], %[[CONST]]) {_quantization_method = "static_range_ptq { }"} : (tensor<3x4x2x!quant.uniform>, tensor<2x3x2xi32>) -> tensor<2x3x2x2x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE:.+]] = stablehlo.uniform_dequantize %[[CALL]] : (tensor<2x3x2x2x!quant.uniform) -> tensor<2x3x2x2xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE]] : tensor<2x3x2x2xf32> + +// CHECK: func.func private @quantized_gather_fn(%[[ARG_0:.+]]: tensor<3x4x2x!quant.uniform>, %[[ARG_1:.+]]: tensor<2x3x2xi32>) -> tensor<2x3x2x2x!quant.uniform> attributes {_from_xla_call_module} + func.func private @composite_gather_fn(%arg0: tensor<3x4x2xf32>, %arg1: tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32> attributes {_from_xla_call_module} { + %0 = "stablehlo.gather"(%arg0, %arg1) { + dimension_numbers = #stablehlo.gather< + offset_dims = [2, 3], + collapsed_slice_dims = [0], + start_index_map = [1, 0], + index_vector_dim = 2>, + slice_sizes = array, + indices_are_sorted = false + } : (tensor<3x4x2xf32>, tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32> + return %0 : tensor<2x3x2x2xf32> + } +// CHECK: %[[GATHER:.+]] = "stablehlo.gather"(%[[ARG_0]], %[[ARG_1]]) {{.*}} : (tensor<3x4x2x!quant.uniform>, tensor<2x3x2xi32>) -> tensor<2x3x2x2x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[GATHER]] : tensor<2x3x2x2x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_0]] : tensor<2x3x2x2x!quant.uniform> +} + +// ----- + +// Tests that a basic `stablehlo.add` and a fused `stablehlo.dot_general` +// are properly quantized. 
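+// In the CHECK lines below, note the asymmetry between the two lifted
+// functions: the quantized dot_general ends with a stablehlo.uniform_quantize
+// that rescales its wider intermediate result back to the i8 output type,
+// while the quantized add is emitted directly on the i8 operand/result types
+// with no extra requantization step.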
+ +module attributes {tf_saved_model.semantics} { +// CHECK: func.func private @quantize_add_fn(%[[ARG:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} + func.func private @quantize_add_fn(%arg: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst_0 = "tf.Const"() {value = dense<1.00000000e-1> : tensor<1x2xf32>} : () -> tensor<1x2xf32> + %cst_1 = "tf.Const"() {value = dense<1.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "quantization.stats"(%arg) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst_0) {Sout = [#tf_type.shape<1x2>], _entry_function = @composite_add_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_add_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x2xf32> + %2 = "quantization.stats"(%1) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %3 = "quantization.stats"(%2) {layerStats = dense<[5.00000000e-6, 6.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %4 = "tf.XlaCallModule"(%3, %cst_1) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %5 = "quantization.stats"(%4) {layerStats = dense<[5.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %5 : tensor<1x3xf32> + } +// CHECK: %[[CONST:.+]] = stablehlo.constant() <{value = dense<127> : tensor<1x2xi8>}> : () -> tensor<1x2x!quant.uniform> +// CHECK: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<127> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}>> +// CHECK: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[ARG]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[CALL:.+]] = call @quantized_add_fn(%[[UNIFORM_QUANTIZE]], %[[CONST]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE:.+]] = stablehlo.uniform_dequantize %[[CALL]] : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[UNIFORM_DEQUANTIZE]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> + +// CHECK: func.func private @quantized_add_fn(%[[ARG_0:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_1:.+]]: 
tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> attributes {_from_xla_call_module} + func.func private @composite_add_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<1x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.add %arg0, %arg1 : tensor<1x2xf32> + return %0 : tensor<1x2xf32> + } +// CHECK: %[[ADD:.+]] = stablehlo.add %arg0, %arg1 : (tensor<1x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> +// CHECK: return %[[ADD]] : tensor<1x2x!quant.uniform> + +// CHECK: func.func private @quantized_dot_general_fn(%[[ARG_0:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_1:.+]]: tensor<2x3x!quant.uniform:f32:1, {{.*}}>>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +// CHECK: %[[DOT_GENERAL:.+]] = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1,{{.*}}>>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[DOT_GENERAL]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE]] : tensor<1x3x!quant.uniform> +} + +// ----- + +// Tests that `stablehlo.add` is not quantized and emits error when the function +// does not include two ops. + +module attributes {tf_saved_model.semantics} { + func.func private @not_quantize_fn_when_not_singular(%arg: tensor<1x2xf32>) -> tensor<1x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<1.00000000e-1> : tensor<1x2xf32>} : () -> tensor<1x2xf32> + %0 = "quantization.stats"(%arg) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x2>], _entry_function = @composite_add_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_add_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x2xf32> + // expected-error@+1 {{'stablehlo.uniform_dequantize' op operand #0 must be ranked tensor of per-tensor integer quantized or per-axis integer quantized values, but got 'tensor<1x2xf32>'}} + %2 = "quantization.stats"(%1) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + return %2 : tensor<1x2xf32> + } + + func.func private @composite_add_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<1x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.add %arg0, %arg1 : tensor<1x2xf32> + %1 = stablehlo.add %0, %arg1 : tensor<1x2xf32> + return %1 : tensor<1x2xf32> + } +} + +// ----- + +// Tests that `stablehlo.gather` without `static_range_ptq` is not quantized. 
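+// With no `_quantization_method` attribute on the XlaCallModule op, the lifted
+// function is left in float form, so the remaining quantization.stats op is
+// lowered to a stablehlo.uniform_dequantize whose operand is still f32; the
+// expected-error annotation below captures the resulting verifier diagnostic.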
+ +module attributes {tf_saved_model.semantics} { + func.func private @not_quantize_singular_op_without_static_range_ptq(%arg: tensor<3x4x2xf32>) -> tensor<2x3x2x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<1> : tensor<2x3x2xi32>} : () -> tensor<2x3x2xi32> + %0 = "quantization.stats"(%arg) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<3x4x2xf32>) -> tensor<3x4x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<2x3x2x2>], _entry_function = @composite_gather_fn, _stablehlo_version = "1.0.0", _original_entry_function = "composite_gather_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x4x2xf32>, tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32> + // expected-error@+1 {{'stablehlo.uniform_dequantize' op operand #0 must be ranked tensor of per-tensor integer quantized or per-axis integer quantized values, but got 'tensor<2x3x2x2xf32>'}} + %2 = "quantization.stats"(%1) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<2x3x2x2xf32>) -> tensor<2x3x2x2xf32> + return %2 : tensor<2x3x2x2xf32> + } + + func.func private @composite_gather_fn(%arg0: tensor<3x4x2xf32>, %arg1: tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32> attributes {_from_xla_call_module} { + %0 = "stablehlo.gather"(%arg0, %arg1) { + dimension_numbers = #stablehlo.gather< + offset_dims = [2, 3], + collapsed_slice_dims = [0], + start_index_map = [1, 0], + index_vector_dim = 2>, + slice_sizes = array, + indices_are_sorted = false + } : (tensor<3x4x2xf32>, tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32> + return %0 : tensor<2x3x2x2xf32> + } +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_quantize_composite_functions_weight_only.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_quantize_composite_functions_weight_only.mlir new file mode 100644 index 000000000000..1467313c585a --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_quantize_composite_functions_weight_only.mlir @@ -0,0 +1,122 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics \ +// RUN: -tf-stablehlo-quantize-composite-functions | FileCheck --check-prefix=CHECK %s + +// Test that per-tensor weight-only quantized dot_general op is produced when +// empty `weight_only_ptq` is provided. 
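+// A worked sketch of where the per-tensor scale in the CHECK lines comes from,
+// assuming symmetric i8 weight quantization into the range [-127, 127]:
+//   r_max = max|w| = float32(3.000000e-01) ≈ 0.30000001192092896
+//   scale = r_max / 127 ≈ 0.0023622048182750312
+// so the splat weight value quantizes to 127 (the dense<127> constant below).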
+ +module attributes {tf_saved_model.semantics} { + func.func private @quantize_dot_general_per_tensor(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %0 = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32> + %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} + +// CHECK-LABEL: quantize_dot_general_per_tensor +// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2xf32> +// CHECK: %[[CST:.+]] = stablehlo.constant() <{value = dense<127> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>> +// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[CST]]) {_quantization_method = "weight_only_ptq { }"} : (tensor<1x2xf32>, tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<1x3xf32> +// CHECK: return %[[CALL]] + +// CHECK: quantized_dot_general_fn +// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x2xf32>, %[[ARG2:.+]]: tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<1x3xf32> +// CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG1]], %[[ARG2]] +// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<1x3xf32> +// CHECK: return %[[DOT]] + +// ----- + +// Test that per-tensor weight-only quantized convolution op is produced when +// empty `weight_only_ptq` is provided. 
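+// Unlike the static-range tests earlier in this change, weight-only
+// quantization leaves activations in f32: only the weight constant becomes a
+// quantized tensor, and the convolution in the CHECK lines below consumes a
+// mixed (f32, !quant.uniform) operand pair and still produces an f32 result.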
+ +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_per_tensor(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { + %0 = stablehlo.constant dense<3.000000e-01> : tensor<2x3x3x2xf32> + %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %1 : tensor<1x3x4x2xf32> + } + + func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %0 : tensor<1x3x4x2xf32> + } +} + +// CHECK-LABEL: quantize_conv_per_tensor +// CHECK-SAME: %[[ARG0:.+]]: tensor<1x3x4x3xf32> +// CHECK: %[[CST:.+]] = stablehlo.constant() <{value = dense<127> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>> +// CHECK: %[[CALL:.+]] = call @quantized_conv_fn(%[[ARG0]], %[[CST]]) {_quantization_method = "weight_only_ptq { }"} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<1x3x4x2xf32> +// CHECK: return %[[CALL]] + +// CHECK: quantized_conv_fn +// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x3x4x3xf32>, %[[ARG2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<1x3x4x2xf32> +// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[ARG1]], %[[ARG2]]) +// CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<1x3x4x2xf32> +// CHECK: return %[[CONV]] + +// ----- + +// Test that per-channel weight-only quantized dot_general op is produced when +// `weight_only_ptq` with `dimension_specs` is provided. 
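+// As a syntax reminder (not produced by this test), the per-channel quantized
+// type in the CHECK lines has the general form
+//   !quant.uniform<i8<-127:127>:f32:1, {s_0, s_1, s_2}>
+// where `:1` names the quantized dimension and the braces hold one scale per
+// slice along it; with a splat weight all three scales come out equal.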
+ +module attributes {tf_saved_model.semantics} { + func.func private @quantize_dot_general_per_channel(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %0 = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32> + %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {}}}}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} + +// CHECK-LABEL: quantize_dot_general_per_channel +// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2xf32> +// CHECK: %[[CST:.+]] = stablehlo.constant() <{value = dense<127> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32:1, {0.0023622048182750312,0.0023622048182750312,0.0023622048182750312}>> +// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[CST]]) {_quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {}}}}"} +// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform:f32:1, {0.0023622048182750312,0.0023622048182750312,0.0023622048182750312}>>) -> tensor<1x3xf32> +// CHECK: return %[[CALL]] + +// CHECK: quantized_dot_general_fn +// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x2xf32>, %[[ARG2:.+]]: tensor<2x3x!quant.uniform:f32:1, {0.0023622048182750312,0.0023622048182750312,0.0023622048182750312}>>) -> tensor<1x3xf32> +// CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG1]], %[[ARG2]] +// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform:f32:1, {0.0023622048182750312,0.0023622048182750312,0.0023622048182750312}>>) -> tensor<1x3xf32> +// CHECK: return %[[DOT]] + +// ----- + +// Test that per-channel weight-only quantized convolution op is produced when +// `weight_only_ptq` with `dimension_specs` is provided. 
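+// For the [0, 1, i, o] kernel layout used below, the output-feature dimension
+// is 3, so per-channel quantization of the tensor<2x3x3x2xf32> weight carries
+// two scales (one per output channel) in its `:f32:3, {...}` type.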
+ +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_per_channel(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { + %0 = stablehlo.constant dense<3.000000e-01> : tensor<2x3x3x2xf32> + %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {}}}}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %1 : tensor<1x3x4x2xf32> + } + + func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %0 : tensor<1x3x4x2xf32> + } +} + +// CHECK-LABEL: quantize_conv_per_channel +// CHECK-SAME: %[[ARG0:.+]]: tensor<1x3x4x3xf32> +// CHECK: %[[CST:.+]] = stablehlo.constant() <{value = dense<127> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>> +// CHECK: %[[CALL:.+]] = call @quantized_conv_fn(%[[ARG0]], %[[CST]]) {_quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {}}}}"} +// CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<1x3x4x2xf32> +// CHECK: return %[[CALL]] + +// CHECK: quantized_conv_fn +// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x3x4x3xf32>, %[[ARG2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<1x3x4x2xf32> +// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[ARG1]], %[[ARG2]]) +// CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<1x3x4x2xf32> +// CHECK: return %[[CONV]] diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_remove_sharding_custom_call.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_remove_sharding_custom_call.mlir new file mode 100644 index 000000000000..c408290bd4a9 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_remove_sharding_custom_call.mlir @@ -0,0 +1,20 @@ +// RUN: stablehlo-quant-opt %s -tf-stablehlo-remove-sharding-custom-call \ +// RUN: -split-input-file | FileCheck %s + +// CHECK-LABEL: sharding_custom_call_removed +func.func @sharding_custom_call_removed(%arg0: tensor<3xf32>) -> tensor<3xf32> { + %1 = stablehlo.custom_call @Sharding(%arg0) {mhlo.sharding = ""} : (tensor<3xf32>) -> tensor<3xf32> + return %1 : tensor<3xf32> +} +// CHECK-NOT: custom_call + +// ----- + +// Tests that a custom_call that is not @Sharding is not removed. 
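+// The pass keys off the custom call target name alone, so any other target
+// (here the placeholder @NotSharding) is expected to pass through unchanged.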
+ +// CHECK-LABEL: custom_call_not_removed +func.func @custom_call_not_removed(%arg0: tensor<3xf32>) -> tensor<3xf32> { + %1 = stablehlo.custom_call @NotSharding(%arg0) : (tensor<3xf32>) -> tensor<3xf32> + return %1 : tensor<3xf32> +} +// CHECK: custom_call @NotSharding diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir new file mode 100644 index 000000000000..ad1d99ac1fbf --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir @@ -0,0 +1,476 @@ +// RUN: stablehlo-quant-opt %s -split-input-file \ +// RUN: -tf-stablehlo-replace-stablehlo-ops-in-main-function-with-xla-call-module-ops \ +// RUN: | FileCheck %s + +// Modules with "main" or "serving_default" should properly run this pass and +// convert subgraphs into XLACallModuleOp. + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { + + // CHECK: func private @_stablehlo_main_1 + // CHECK: %[[CONSTANT_0:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + // CHECK: %[[CONSTANT_1:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1x3xf32> + // CHECK: return + // CHECK: } + + // CHECK: func private @_stablehlo_main_0 + // CHECK: %[[CONSTANT_0:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<3x64xf32> + // CHECK: %[[CONSTANT_1:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1x64xf32> + // CHECK: return + // CHECK: } + + func.func @main(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x64xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + %1 = stablehlo.constant dense<1.000000e+03> : tensor<1x3xf32> + %2:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>) + %3 = "tf.XlaCallModule"(%2#0, %0, %1) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "{{.*}}", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + %4:4 = "tf.CustomAggregator"(%3) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) + %5 = stablehlo.constant dense<1.000000e+03> : tensor<3x64xf32> + %6 = stablehlo.constant dense<1.000000e+03> : tensor<1x64xf32> + %7:4 = "tf.CustomAggregator"(%4#0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) + %8 = 
"tf.XlaCallModule"(%7#0, %5, %6) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _stablehlo_version = "{{.*}}", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x3xf32>, tensor<3x64xf32>, tensor<1x64xf32>) -> tensor<1x64xf32> + %9:4 = "tf.CustomAggregator"(%6) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x64xf32>) -> (tensor<1x64xf32>, tensor, tensor, tensor<*xi64>) + return %9#0 : tensor<1x64xf32> + } + + // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}, module = "", platforms = ["CPU", "TPU"], use_shardy_partitioner = false, version = 9 : i64}> {_entry_function = @_stablehlo_main_1 + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: %[[XLA_CALL_MODULE_0:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "{{.*}}", _tfl_quant_trait = "fully_quantizable"} + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_0]]) + // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}, module = "", platforms = ["CPU", "TPU"], use_shardy_partitioner = false, version = 9 : i64}> {_entry_function = @_stablehlo_main_0 + // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[CUSTOM_AGGREGATOR_1]]) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: %[[XLA_CALL_MODULE_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _stablehlo_version = "{{.*}}", _tfl_quant_trait = "fully_quantizable"} + // CHECK: %[[CUSTOM_AGGREGATOR_3:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_1:.*]]) + // CHECK: return %[[CUSTOM_AGGREGATOR_3]] : tensor<1x64xf32> + // CHECK: } + + // CHECK: @composite_dot_general_fn_1 + // CHECK-NOT: tf_quant.composite_function + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.add %0, %arg2 : 
tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + + // CHECK: @composite_dot_general_with_relu_fn_1 + // CHECK-NOT: tf_quant.composite_function + func.func private @composite_dot_general_with_relu_fn_1(%arg0: tensor<1x3xf32>, %arg1: tensor<3x64xf32>, %arg2: tensor<1x64xf32>) -> tensor<1x64xf32> { + %0 = stablehlo.constant dense<0.000000e+00> : tensor<1x64xf32> + %1 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x3xf32>, tensor<3x64xf32>) -> tensor<1x64xf32> + %2 = stablehlo.add %1, %arg2 : tensor<1x64xf32> + %3 = stablehlo.maximum %2, %0 : tensor<1x64xf32> + return %3 : tensor<1x64xf32> + } +} + + +// ----- + +// Tests that the subgraph in serving_default excluding the tf.Identity is +// converted to a single XlaCallModuleOp. + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1654 : i32}, tf_saved_model.semantics} { + + // CHECK: func private @_stablehlo_main_0(%arg0: tensor<i32>, %arg1: tensor<1x1024xf32>) + // CHECK: %[[CONSTANT_0:.*]] = stablehlo.constant dense<0.134728625> : tensor<1x3xf32> + // CHECK: %[[CONSTANT_1:.*]] = stablehlo.constant dense<-1.280000e+02> : tensor<1x1024xf32> + // CHECK: %[[CONSTANT_2:.*]] = stablehlo.constant dense<0.003921567> : tensor<1x1024xf32> + // CHECK: %[[DIVIDE:.*]] = stablehlo.divide %arg1, %[[CONSTANT_2]] + // CHECK: %[[ADD:.*]] = stablehlo.add %[[DIVIDE]], %[[CONSTANT_1]] + // CHECK: return %[[ADD]] + // CHECK: } + + // CHECK: @serving_default + func.func @serving_default(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x1024xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = stablehlo.constant dense<0.134728625> : tensor<1x3xf32> + %1 = stablehlo.constant dense<-1.280000e+02> : tensor<1x1024xf32> + %2 = stablehlo.constant dense<0.003921567> : tensor<1x1024xf32> + %3 = stablehlo.divide %arg0, %2 : tensor<1x1024xf32> + %4 = stablehlo.add %3, %1 : tensor<1x1024xf32> + %5 = "tf.Identity"(%4) {device = ""} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + return %5 : tensor<1x1024xf32> + } + + // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"(%arg0) <{Sout = [#tf_type.shape<1x1024>], {{.*}}, module = "", platforms = ["CPU", "TPU"], use_shardy_partitioner = false, version = 9 : i64}> {_entry_function = @_stablehlo_main_0, _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _stablehlo_version = "{{.*}}"} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP]]) + // CHECK: return %[[IDENTITY]] + // CHECK: } + +} + +// ----- + +// Tests that the first stablehlo.constant is converted to XlaCallModuleOp.
+ +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { + // CHECK: func private @_stablehlo_main_0 + // CHECK: %[[CONSTANT:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + // CHECK: return %[[CONSTANT:.*]] + // CHECK: } + + // CHECK: @serving_default + func.func @serving_default(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + %1:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>) + %2 = "tf.XlaCallModule"(%1#0, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "{{.*}}", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + %3:4 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) + return %3#0 : tensor<1x3xf32> + } + + // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}}, module = "", platforms = ["CPU", "TPU"], use_shardy_partitioner = false, version = 9 : i64}> {_entry_function = @_stablehlo_main_0, _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _stablehlo_version = "{{.*}}"} + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "{{.*}}" + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: return %[[CUSTOM_AGGREGATOR_1]] + // CHECK: } + + // CHECK: @composite_dot_general_fn_1 + // CHECK-NOT: tf_quant.composite_function + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} + +// ----- + +// Tests to confirm that the StableHLO graph is not replaced if "main" or +// "serving_default" function is not in the module. 
+ +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { + // CHECK-NOT: func private @_stablehlo_main_ + + // CHECK-LABEL: @random_name + func.func @random_name(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + %1:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>) + %2 = "tf.XlaCallModule"(%1#0, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "1.0.0", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + %3:4 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) + return %3#0 : tensor<1x3xf32> + } + + // CHECK: %[[CONSTANT:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[XLA_CALL_MODULE_EXTRACTED_FROM_SUBGRAPH:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "1.0.0" + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: return %[[CUSTOM_AGGREGATOR_1]] + // CHECK: } + + // CHECK: @composite_dot_general_fn_1 + // CHECK: tf_quant.composite_function + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} + +// ----- + +// Tests where StableHLO graph in main has a small constant to be duplicated. 
+ +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { + // CHECK: func private @_stablehlo_main_1(%arg0: tensor) -> tensor<1024x3xf32> attributes {_from_xla_call_module} + // CHECK: %[[CONSTANT1:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + // CHECK: return %[[CONSTANT1:.*]] + // CHECK: } + + // CHECK: func private @_stablehlo_main_0(%arg0: tensor + // CHECK-SAME: %[[INPUT1:.*]]: tensor<1024x3xf32>, %[[INPUT2:.*]]: tensor<1024x3xf32> + // CHECK: %[[CONSTANT2:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + // CHECK: %[[ADD:.*]] = stablehlo.add %[[INPUT1]], %[[CONSTANT2]] : tensor<1024x3xf32> + // CHECK: %[[MUL:.*]] = stablehlo.multiply %[[INPUT1]], %[[INPUT2]] : tensor<1024x3xf32> + // CHECK: return %[[ADD]], %[[MUL]] + // CHECK: } + + // CHECK: @serving_default + func.func @serving_default(%arg0: tensor<1024x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1024x3xf32> {tf_saved_model.index_path = ["output1"]}, tensor<1024x3xf32> {tf_saved_model.index_path = ["output2"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + %1:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) + %2 = "tf.XlaCallModule"(%1#0, %0) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "1.0.0", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> + %3:4 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) + %4 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + %5 = stablehlo.add %3#0, %4 : tensor<1024x3xf32> + %6 = stablehlo.multiply %3#0, %0 : tensor<1024x3xf32> + return %5, %6 : tensor<1024x3xf32>, tensor<1024x3xf32> + } + + // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "1.0.0" + // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[SUBGRAPH_2:.*]]:2 = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<1024x3>, #tf_type.shape<1024x3>], {{.*}}}> {_entry_function = 
@_stablehlo_main_0 + // CHECK: return %[[SUBGRAPH_2]]#0, %[[SUBGRAPH_2]]#1 + // CHECK: } + + // CHECK: @composite_dot_general_fn_1 + // CHECK-NOT: tf_quant.composite_function + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} + +// ----- + +// Tests where StableHLO graph in main has branches. +// This test makes sure tracing won't stop at op (%1) with multiple uses. + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { + // CHECK: func private @_stablehlo_main_1(%arg0: tensor) -> tensor<3x11xf32> + // CHECK: %[[CONSTANT_1:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<3x11xf32> + // CHECK: return %[[CONSTANT_1:.*]] + // CHECK: } + + // CHECK: func private @_stablehlo_main_0 + // CHECK-SAME: (%arg0: tensor, %[[INPUT_1:.*]]: tensor<3x11xf32>) + // CHECK-SAME: -> tensor<3x11xf32> + // CHECK: %[[CONSTANT_2:.*]] = stablehlo.constant dense<1.000000e+01> : tensor<3x11xf32> + // CHECK: %[[ADD:.*]] = stablehlo.add %[[INPUT_1]], %[[CONSTANT_2]] : tensor<3x11xf32> + // CHECK: %[[MUL:.*]] = stablehlo.multiply %[[ADD]], %[[CONSTANT_2]] : tensor<3x11xf32> + // CHECK: return %[[MUL]] + // CHECK: } + + // CHECK: @serving_default + func.func @serving_default(%arg0: tensor<3x3xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<3x11xf32> {tf_saved_model.index_path = ["output1"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = stablehlo.constant dense<1.000000e+03> : tensor<3x11xf32> + // %1 is large enough that it won't be duplicated. 
+ %1 = stablehlo.constant dense<1.000000e+01> : tensor<3x11xf32> + %2:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<3x3xf32>) -> (tensor<3x3xf32>, tensor, tensor, tensor<*xi64>) + %3 = "tf.XlaCallModule"(%2#0, %0) {Sout = [#tf_type.shape<3x11>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "1.0.0", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x3xf32>, tensor<3x11xf32>) -> tensor<3x11xf32> + %4:4 = "tf.CustomAggregator"(%3) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<3x11xf32>) -> (tensor<3x11xf32>, tensor, tensor, tensor<*xi64>) + %5 = stablehlo.add %4#0, %1 : tensor<3x11xf32> + %6 = stablehlo.multiply %5, %1 : tensor<3x11xf32> + return %6 : tensor<3x11xf32> + } + + // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<3x11>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<3x11>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "1.0.0" + // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[SUBGRAPH_2:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]]) <{Sout = [#tf_type.shape<3x11>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_0 + // CHECK: return %[[SUBGRAPH_2]] + // CHECK: } + + // CHECK: @composite_dot_general_fn_1 + // CHECK-NOT: tf_quant.composite_function + func.func private @composite_dot_general_fn_1(%arg0: tensor<3x3xf32>, %arg1: tensor<3x11xf32>) -> tensor<3x11xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<3x3xf32>, tensor<3x11xf32>) -> tensor<3x11xf32> + return %0 : tensor<3x11xf32> + } +} + +// ----- + +// Tests where StableHLO graph in main has dead end. +// This test makes sure tracing will include the dead end from the op in the +// same sub graph: +// stablehlo.add and %0 along with its dead end branch are in the same sub +// graph. 
+ +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { + // CHECK: func.func private @_stablehlo_main_1(%arg0: tensor) -> tensor<1024x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[CONSTANT_0:.*]] = stablehlo.constant dense<2.000000e+03> : tensor<1024x3xf32> + // CHECK: return %[[CONSTANT_0]] + // CHECK: } + + // CHECK: func.func private @_stablehlo_main_0(%arg0: tensor, %[[ARG_1:.*]]: tensor<1024x3xf32>) -> tensor<1024x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[CONSTANT_1:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + // CHECK: %[[CONSTANT_2:.*]] = stablehlo.constant dense<5.000000e+01> : tensor<1024x3xf32> + // CHECK: %[[CONSTANT_3:.*]] = stablehlo.constant dense<4.000000e+03> : tensor<1024x3xf32> + // CHECK: %[[REMAINDER:.*]] = stablehlo.remainder %[[CONSTANT_3]], %[[CONSTANT_1]] : tensor<1024x3xf32> + // CHECK: %[[COMPARE:.*]] = stablehlo.compare EQ, %[[REMAINDER]], %[[CONSTANT_2]], NOTYPE : (tensor<1024x3xf32>, tensor<1024x3xf32>) -> tensor<1024x3xi1> + // CHECK: stablehlo.custom_call @shape_assertion(%[[COMPARE]]) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor<1024x3xi1>) -> () + // CHECK: %[[ADD:.*]] = stablehlo.add %[[ARG_1]], %[[CONSTANT_3]] + // CHECK: return %[[ADD]] + // CHECK: } + + // CHECK: @serving_default + func.func @serving_default(%arg0: tensor<1024x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1024x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = stablehlo.constant dense<4.000000e+03> : tensor<1024x3xf32> + %1 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + %2 = stablehlo.constant dense<5.000000e+01> : tensor<1024x3xf32> + %3 = stablehlo.remainder %0, %1 : tensor<1024x3xf32> + %4 = stablehlo.compare EQ, %3, %2, NOTYPE : (tensor<1024x3xf32>, tensor<1024x3xf32>) -> tensor<1024x3xi1> + stablehlo.custom_call @shape_assertion(%4) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor<1024x3xi1>) -> () + %5 = stablehlo.constant dense<2.000000e+03> : tensor<1024x3xf32> + %6:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) + %7 = "tf.XlaCallModule"(%6#0, %5) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "1.0.0", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> + %8:4 = "tf.CustomAggregator"(%7) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) + %9 = stablehlo.add %8#0, %0 : tensor<1024x3xf32> + return %9 : tensor<1024x3xf32> + } + // CHECK: %[[SUBGRAPH_0:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 + // CHECK: 
%[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[SUBGRAPH_0]]) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "1.0.0" + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]]) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_0 + // CHECK: return %[[SUBGRAPH_1]] : tensor<1024x3xf32> + // CHECK: } +} + +// ----- + +// Tests where StableHLO graph in main has branch. +// This test makes sure the branch will not be added to subgraph when it reaches +// a tf op: +// stablehlo.add and %0 are not in the same subgraph. + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { + // CHECK: func.func private @_stablehlo_main_2(%arg0: tensor) -> (tensor<1024x3xf32>, tensor<1024x3xf32>) attributes {_from_xla_call_module} { + // CHECK: %[[CONSTANT_0:.*]] = stablehlo.constant dense<4.000000e+03> : tensor<1024x3xf32> + // CHECK: %[[CONSTANT_1:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + // CHECK: %[[REMAINDER:.*]] = stablehlo.remainder %[[CONSTANT_0]], %[[CONSTANT_1]] : tensor<1024x3xf32> + // CHECK: return %[[CONSTANT_0]], %[[REMAINDER]] + // CHECK: } + + // CHECK: func.func private @_stablehlo_main_1(%arg0: tensor) -> tensor<1024x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[CONSTANT_2:.*]] = stablehlo.constant dense<2.000000e+03> : tensor<1024x3xf32> + // CHECK: return %[[CONSTANT_2]] : tensor<1024x3xf32> + // CHECK: } + + // CHECK: func.func private @_stablehlo_main_0(%arg0: tensor, %[[ARG_1:.*]]: tensor<1024x3xf32>, %[[ARG_2:.*]]: tensor<1024x3xf32>) -> tensor<1024x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[ADD:.*]] = stablehlo.add %[[ARG_1]], %[[ARG_2]] + // CHECK: return %[[ADD]] + + // CHECK: @serving_default + func.func @serving_default(%arg0: tensor<1024x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1024x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = stablehlo.constant dense<4.000000e+03> : tensor<1024x3xf32> + %1 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + %2 = stablehlo.remainder %0, %1 : tensor<1024x3xf32> + %3 = "tf.Identity"(%2) {device = ""} : (tensor<1024x3xf32>) -> tensor<1024x3xf32> + %4 = stablehlo.constant dense<2.000000e+03> : tensor<1024x3xf32> + %5:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) + %6 = "tf.XlaCallModule"(%5#0, %4) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "1.0.0", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list 
= [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> + %7:4 = "tf.CustomAggregator"(%6) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) + %8 = stablehlo.add %7#0, %0 : tensor<1024x3xf32> + return %8 : tensor<1024x3xf32> + } + // CHECK: %[[SUBGRAPH_0:.*]]:2 = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>, #tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_2 + // CHECK: %[[IDENTIFY:.*]] = "tf.Identity"(%[[SUBGRAPH_0]]#1) {device = ""} : (tensor<1024x3xf32>) -> tensor<1024x3xf32> + // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "1.0.0" + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[SUBGRAPH_2:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[SUBGRAPH_0]]#0) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_0 + // CHECK: return %[[SUBGRAPH_2]] : tensor<1024x3xf32> + // CHECK: } +} + +// ----- + +// Tests where StableHLO graph in main has dead end. +// This test checks tracing will stop if the dead end is too deep (>5): +// stablehlo.add and %0 are not in the same subgraph. 
+ +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { + // CHECK: func.func private @_stablehlo_main_1(%arg0: tensor) -> (tensor<1024x3xf32>, tensor<1024x3xf32>) attributes {_from_xla_call_module} { + // CHECK: %[[CONSTANT_0:.*]] = stablehlo.constant dense<4.000000e+03> : tensor<1024x3xf32> + // CHECK: %[[CONSTANT_1:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + // CHECK: %[[CONSTANT_2:.*]] = stablehlo.constant dense<5.000000e+01> : tensor<1024x3xf32> + // CHECK: %[[REMAINDER_0:.*]] = stablehlo.remainder %[[CONSTANT_0]], %[[CONSTANT_1]] : tensor<1024x3xf32> + // CHECK: %[[REMAINDER_1:.*]] = stablehlo.remainder %[[REMAINDER_0]], %[[CONSTANT_1]] : tensor<1024x3xf32> + // CHECK: %[[REMAINDER_2:.*]] = stablehlo.remainder %[[REMAINDER_1]], %[[CONSTANT_1]] : tensor<1024x3xf32> + // CHECK: %[[REMAINDER_3:.*]] = stablehlo.remainder %[[REMAINDER_2]], %[[CONSTANT_1]] : tensor<1024x3xf32> + // CHECK: %[[COMPARE:.*]] = stablehlo.compare EQ, %[[REMAINDER_3]], %[[CONSTANT_2]], NOTYPE : (tensor<1024x3xf32>, tensor<1024x3xf32>) -> tensor<1024x3xi1> + // CHECK: stablehlo.custom_call @shape_assertion(%[[COMPARE]]) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor<1024x3xi1>) -> () + // CHECK: %[[CONSTANT_3:.*]] = stablehlo.constant dense<2.000000e+03> : tensor<1024x3xf32> + // CHECK: return %[[CONSTANT_0]], %[[CONSTANT_3]] + // CHECK: } + + // CHECK: func.func private @_stablehlo_main_0(%arg0: tensor, %[[ARG_1:.*]]: tensor<1024x3xf32>, %[[ARG_2:.*]]: tensor<1024x3xf32>) -> tensor<1024x3xf32> attributes {_from_xla_call_module} { + // CHECK: %[[ADD:.*]] = stablehlo.add %[[ARG_1]], %[[ARG_2]] + // CHECK: return %[[ADD]] + // CHECK: } + + // CHECK: @serving_default + func.func @serving_default(%arg0: tensor<1024x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1024x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = stablehlo.constant dense<4.000000e+03> : tensor<1024x3xf32> + %1 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + %2 = stablehlo.constant dense<5.000000e+01> : tensor<1024x3xf32> + %3 = stablehlo.remainder %0, %1 : tensor<1024x3xf32> + %4 = stablehlo.remainder %3, %1 : tensor<1024x3xf32> + %5 = stablehlo.remainder %4, %1 : tensor<1024x3xf32> + %6 = stablehlo.remainder %5, %1 : tensor<1024x3xf32> + %7 = stablehlo.compare EQ, %6, %2, NOTYPE : (tensor<1024x3xf32>, tensor<1024x3xf32>) -> tensor<1024x3xi1> + stablehlo.custom_call @shape_assertion(%7) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor<1024x3xi1>) -> () + %8 = stablehlo.constant dense<2.000000e+03> : tensor<1024x3xf32> + %9:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) + %10 = "tf.XlaCallModule"(%9#0, %8) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "{{.*}}", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : 
(tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> + %11:4 = "tf.CustomAggregator"(%10) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) + %12 = stablehlo.add %11#0, %0 : tensor<1024x3xf32> + return %12 : tensor<1024x3xf32> + } + // CHECK: %[[SUBGRAPH_0:.*]]:2 = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>, #tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[SUBGRAPH_0]]#1) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_version = "{{.*}}" + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[SUBGRAPH_0]]#0) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_0 + // CHECK: return %[[SUBGRAPH_1]] : tensor<1024x3xf32> + // CHECK: } +} + +// ----- + +// main function contains PartitionedCall and StatefulPartitionedCall ops which +// is used to preserve aliased functions. This test make sure stablehlo ops in +// each PartitionedCall functions are lifted. + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { + // CHECK: func private @_stablehlo_main_2 + // CHECK: stablehlo.multiply %arg1, %arg2 : tensor<3x3xf32> + // CHECK: return + // CHECK: } + + // CHECK: func private @_stablehlo_main_1 + // CHECK: stablehlo.add %arg1, %arg2 : tensor<3x3xf32> + // CHECK: return + // CHECK: } + + // CHECK: func private @_stablehlo_main_0 + // CHECK: stablehlo.constant dense<1.000000e+03> : tensor<3x3xf32> + // CHECK: stablehlo.constant dense<2.000000e+03> : tensor<3x3xf32> + // CHECK: return + // CHECK: } + + func.func @main() -> (tensor<3x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = stablehlo.constant dense<1.000000e+03> : tensor<3x3xf32> + %1 = stablehlo.constant dense<2.000000e+03> : tensor<3x3xf32> + %2 = "tf.StatefulPartitionedCall"(%0, %1) <{ + config = "", config_proto = "", executor_type = "", f = @some_func + }> { + _collective_manager_ids = [], device = "" + } : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + %3 = "tf.PartitionedCall"(%2, %1) <{ + config = "", config_proto = "", executor_type = "", f = @some_other_func + }> { + _collective_manager_ids = [], device = "" + } : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + return %3 : tensor<3x3xf32> + } + // CHECK: func.func @main + // CHECK: %[[INPUT:.*]]:3 = "tf.XlaCallModule"() + // CHECK-SAME: _entry_function = @_stablehlo_main_0 + // CHECK: %[[ADD:.*]] = "tf.StatefulPartitionedCall"(%[[INPUT]]#1, %[[INPUT]]#2) + // CHECK-SAME: f = @some_func + // CHECK: "tf.PartitionedCall"(%[[ADD]], %[[INPUT]]#0) + // CHECK-SAME: f = @some_other_func + // 
CHECK: return + + func.func private @some_func(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x3xf32> attributes {tf._noinline = true} { + %0 = stablehlo.add %arg0, %arg1 : tensor<3x3xf32> + return %0 : tensor<3x3xf32> + } + // CHECK: func.func private @some_func + // CHECK: tf.XlaCallModule + // CHECK-SAME: _entry_function = @_stablehlo_main_1 + // CHECK: return + + func.func private @some_other_func(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x3xf32> attributes {tf._noinline = true} { + %0 = stablehlo.multiply %arg0, %arg1 : tensor<3x3xf32> + return %0 : tensor<3x3xf32> + } + // CHECK: func.func private @some_other_func + // CHECK: tf.XlaCallModule + // CHECK-SAME: _entry_function = @_stablehlo_main_2 + // CHECK: return +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_restore_function_name.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_restore_function_name.mlir new file mode 100644 index 000000000000..b6f746c8e469 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_restore_function_name.mlir @@ -0,0 +1,52 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-restore-function-name | FileCheck %s + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1646 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: @serving_default + // CHECK-SAME: %[[ARG0:[^:[:space:]]+]] + // CHECK-SAME: %[[ARG1:[^:[:space:]]+]] + func.func private @serving_default(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>) -> tensor<1x3xf32> { + %0 = "tf.XlaCallModule"(%arg0, %arg1) {Sout = [#tf_type.shape<1x3>], _entry_function = @main, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + // CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG0]], %[[ARG1]]) + // CHECK-SAME: _entry_function = @composite_dot_general_fn_1 + // CHECK-SAME: _original_entry_function = "composite_dot_general_fn_1" + // CHECK: return %[[CALL]] + } + + // CHECK: @composite_dot_general_fn_1 + // CHECK-SAME: %[[ARG2:[^:[:space:]]+]] + // CHECK-SAME: %[[ARG3:[^:[:space:]]+]] + func.func private @main(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + // CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG2]], %[[ARG3]] + // CHECK: return %[[DOT]] + } +} + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1646 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: @serving_default + // CHECK-SAME: %[[ARG0:[^:[:space:]]+]] + // CHECK-SAME: %[[ARG1:[^:[:space:]]+]] + func.func private @serving_default(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>) -> tensor<1x3xf32> { + %0 = "tf.XlaCallModule"(%arg0, %arg1) {Sout = [#tf_type.shape<1x3>], _entry_function = @main, _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + return %0 : 
tensor<1x3xf32> + // CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG0]], %[[ARG1]]) + // CHECK-SAME: _entry_function = @main + // CHECK-NOT: _original_entry_function = "composite_dot_general_fn_1" + // CHECK: return %[[CALL]] + } + + // CHECK: @main + // CHECK-NOT: @composite_dot_general_fn_1 + // CHECK-SAME: %[[ARG2:[^:[:space:]]+]] + // CHECK-SAME: %[[ARG3:[^:[:space:]]+]] + func.func private @main(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + // CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG2]], %[[ARG3]] + // CHECK: return %[[DOT]] + } +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_shape_cstr_legalize_to_hlo.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_shape_cstr_legalize_to_hlo.mlir new file mode 100644 index 000000000000..e0a2ba600993 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_shape_cstr_legalize_to_hlo.mlir @@ -0,0 +1,110 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-convert-shape-to-stablehlo-with-constraints --verify-diagnostics | FileCheck %s + +// CHECK-LABEL: func.func @shape_cstr_broadcastable +func.func @shape_cstr_broadcastable(%arg0: tensor<2xindex>, %arg1: tensor<2xindex>) { + %0 = shape.cstr_broadcastable %arg0, %arg1 : tensor<2xindex>, tensor<2xindex> + shape.assuming %0 { + } + func.return + // CHECK: %[[DIMS1:.*]] = builtin.unrealized_conversion_cast %arg0 : tensor<2xindex> to tensor<2xi32> + // CHECK-NEXT: %[[DIMS2:.*]] = builtin.unrealized_conversion_cast %arg1 : tensor<2xindex> to tensor<2xi32> + // CHECK-NEXT: %[[ONES:.*]] = stablehlo.constant dense<1> : tensor<2xi32> + // CHECK-NEXT: %[[DIMS1_IS_1:.*]] = stablehlo.compare EQ, %[[DIMS1]], %[[ONES:.*]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[DIMS2_IS_1:.*]] = stablehlo.compare EQ, %[[DIMS2]], %[[ONES:.*]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[EITHER_DIM_IS_1:.*]] = stablehlo.or %[[DIMS1_IS_1]], %[[DIMS2_IS_1]] : tensor<2xi1> + // CHECK-NEXT: %[[DIMS_EQ:.*]] = stablehlo.compare EQ, %[[DIMS1]], %[[DIMS2]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[DIMS_BROADCASTABLE:.*]] = stablehlo.or %[[EITHER_DIM_IS_1]], %[[DIMS_EQ]] : tensor<2xi1> + // CHECK-NEXT: %[[TRUE:.*]] = stablehlo.constant dense<true> : tensor<1xi1> + // CHECK-NEXT: %[[DIM1_BROADCASTABLE:.*]] = stablehlo.slice %[[DIMS_BROADCASTABLE]] [0:1] : (tensor<2xi1>) -> tensor<1xi1> + // CHECK-NEXT: %[[BROADCASTABLE_TEMP:.*]] = stablehlo.and %[[TRUE]], %[[DIM1_BROADCASTABLE]] : tensor<1xi1> + // CHECK-NEXT: %[[DIM2_BROADCASTABLE:.*]] = stablehlo.slice %[[DIMS_BROADCASTABLE]] [1:2] : (tensor<2xi1>) -> tensor<1xi1> + // CHECK-NEXT: %[[ALL_BROADCASTABLE:.*]] = stablehlo.and %[[BROADCASTABLE_TEMP]], %[[DIM2_BROADCASTABLE]] : tensor<1xi1> + // CHECK-NEXT: %[[ALL_BROADCASTABLE_SCALAR:.*]] = stablehlo.reshape %[[ALL_BROADCASTABLE]] : (tensor<1xi1>) -> tensor<i1> + // CHECK-NEXT: stablehlo.custom_call @shape_assertion(%[[ALL_BROADCASTABLE_SCALAR]]) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor<i1>) -> () + // CHECK-NEXT: %[[WITNESS:.*]] = shape.const_witness true + // CHECK-NEXT: shape.assuming %[[WITNESS]] { + // CHECK-NEXT: } + // CHECK-NEXT: return +} + +// ----- + +// CHECK-LABEL: func @shape_cstr_broadcastable_different_dims_1
+func.func @shape_cstr_broadcastable_different_dims_1(%arg0: tensor<2xindex>, %arg1: tensor<1xindex>) { + %0 = shape.cstr_broadcastable %arg0, %arg1 : tensor<2xindex>, tensor<1xindex> + shape.assuming %0 { + } + func.return + // CHECK: %[[DIMS1:.*]] = builtin.unrealized_conversion_cast %arg0 : tensor<2xindex> to tensor<2xi32> + // CHECK-NEXT: %[[DIMS2:.*]] = builtin.unrealized_conversion_cast %arg1 : tensor<1xindex> to tensor<1xi32> + // CHECK-NEXT: %[[PAD:.*]] = stablehlo.constant dense<1> : tensor<1xi32> + // CHECK-NEXT: %[[DIMS2_PAD:.*]] = stablehlo.concatenate %[[PAD]], %[[DIMS2]], dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + // CHECK-NEXT: %[[ONES:.*]] = stablehlo.constant dense<1> : tensor<2xi32> + // CHECK-NEXT: %[[DIMS1_IS_1:.*]] = stablehlo.compare EQ, %[[DIMS1]], %[[ONES:.*]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[DIMS2_IS_1:.*]] = stablehlo.compare EQ, %[[DIMS2_PAD]], %[[ONES:.*]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[EITHER_DIM_IS_1:.*]] = stablehlo.or %[[DIMS1_IS_1]], %[[DIMS2_IS_1]] : tensor<2xi1> + // CHECK-NEXT: %[[DIMS_EQ:.*]] = stablehlo.compare EQ, %[[DIMS1]], %[[DIMS2_PAD]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[DIMS_BROADCASTABLE:.*]] = stablehlo.or %[[EITHER_DIM_IS_1]], %[[DIMS_EQ]] : tensor<2xi1> + // CHECK-NEXT: %[[TRUE:.*]] = stablehlo.constant dense<true> : tensor<1xi1> + // CHECK-NEXT: %[[DIM1_BROADCASTABLE:.*]] = stablehlo.slice %[[DIMS_BROADCASTABLE]] [0:1] : (tensor<2xi1>) -> tensor<1xi1> + // CHECK-NEXT: %[[BROADCASTABLE_TEMP:.*]] = stablehlo.and %[[TRUE]], %[[DIM1_BROADCASTABLE]] : tensor<1xi1> + // CHECK-NEXT: %[[DIM2_BROADCASTABLE:.*]] = stablehlo.slice %[[DIMS_BROADCASTABLE]] [1:2] : (tensor<2xi1>) -> tensor<1xi1> + // CHECK-NEXT: %[[ALL_BROADCASTABLE:.*]] = stablehlo.and %[[BROADCASTABLE_TEMP]], %[[DIM2_BROADCASTABLE]] : tensor<1xi1> + // CHECK-NEXT: %[[ALL_BROADCASTABLE_SCALAR:.*]] = stablehlo.reshape %[[ALL_BROADCASTABLE]] : (tensor<1xi1>) -> tensor<i1> + // CHECK-NEXT: stablehlo.custom_call @shape_assertion(%[[ALL_BROADCASTABLE_SCALAR]]) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor<i1>) -> () + // CHECK-NEXT: %[[WITNESS:.*]] = shape.const_witness true + // CHECK-NEXT: shape.assuming %[[WITNESS]] { + // CHECK-NEXT: } + // CHECK-NEXT: return +} + +// ----- + +// CHECK-LABEL: func @shape_cstr_broadcastable_different_dims_2 +func.func @shape_cstr_broadcastable_different_dims_2(%arg0: tensor<1xindex>, %arg1: tensor<2xindex>) { + %0 = shape.cstr_broadcastable %arg0, %arg1 : tensor<1xindex>, tensor<2xindex> + shape.assuming %0 { + } + func.return + // CHECK: %[[DIMS1:.*]] = builtin.unrealized_conversion_cast %arg0 : tensor<1xindex> to tensor<1xi32> + // CHECK-NEXT: %[[DIMS2:.*]] = builtin.unrealized_conversion_cast %arg1 : tensor<2xindex> to tensor<2xi32> + // CHECK-NEXT: %[[PAD:.*]] = stablehlo.constant dense<1> : tensor<1xi32> + // CHECK-NEXT: %[[DIMS1_PAD:.*]] = stablehlo.concatenate %[[PAD]], %[[DIMS1]], dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + // CHECK-NEXT: %[[ONES:.*]] = stablehlo.constant dense<1> : tensor<2xi32> + // CHECK-NEXT: %[[DIMS1_IS_1:.*]] = stablehlo.compare EQ, %[[DIMS1_PAD]], %[[ONES:.*]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[DIMS2_IS_1:.*]] = stablehlo.compare EQ, %[[DIMS2]], %[[ONES:.*]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[EITHER_DIM_IS_1:.*]] = stablehlo.or %[[DIMS1_IS_1]], %[[DIMS2_IS_1]] : tensor<2xi1> + // CHECK-NEXT: 
%[[DIMS_EQ:.*]] = stablehlo.compare EQ, %[[DIMS1_PAD]], %[[DIMS2]] : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> + // CHECK-NEXT: %[[DIMS_BROADCASTABLE:.*]] = stablehlo.or %[[EITHER_DIM_IS_1]], %[[DIMS_EQ]] : tensor<2xi1> + // CHECK-NEXT: %[[TRUE:.*]] = stablehlo.constant dense<true> : tensor<1xi1> + // CHECK-NEXT: %[[DIM1_BROADCASTABLE:.*]] = stablehlo.slice %[[DIMS_BROADCASTABLE]] [0:1] : (tensor<2xi1>) -> tensor<1xi1> + // CHECK-NEXT: %[[BROADCASTABLE_TEMP:.*]] = stablehlo.and %[[TRUE]], %[[DIM1_BROADCASTABLE]] : tensor<1xi1> + // CHECK-NEXT: %[[DIM2_BROADCASTABLE:.*]] = stablehlo.slice %[[DIMS_BROADCASTABLE]] [1:2] : (tensor<2xi1>) -> tensor<1xi1> + // CHECK-NEXT: %[[ALL_BROADCASTABLE:.*]] = stablehlo.and %[[BROADCASTABLE_TEMP]], %[[DIM2_BROADCASTABLE]] : tensor<1xi1> + // CHECK-NEXT: %[[ALL_BROADCASTABLE_SCALAR:.*]] = stablehlo.reshape %[[ALL_BROADCASTABLE]] : (tensor<1xi1>) -> tensor<i1> + // CHECK-NEXT: stablehlo.custom_call @shape_assertion(%[[ALL_BROADCASTABLE_SCALAR]]) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor<i1>) -> () + // CHECK-NEXT: %[[WITNESS:.*]] = shape.const_witness true + // CHECK-NEXT: shape.assuming %[[WITNESS]] { + // CHECK-NEXT: } + // CHECK-NEXT: return +} + +// ----- + +func.func @shape_cstr_broadcast_too_many_operands(%arg0: tensor<4xindex>, %arg1: tensor<4xindex>, %arg2: tensor<4xindex>) { + // expected-error@+1 {{failed to legalize operation 'shape.cstr_broadcastable' that was explicitly marked illegal}} + %0 = shape.cstr_broadcastable %arg0, %arg1, %arg2 : tensor<4xindex>, tensor<4xindex>, tensor<4xindex> + shape.assuming %0 { + } + func.return +} + +// ----- + +func.func @shape_cstr_broadcastable_input_shape(%arg0: !shape.shape, %arg1: !shape.shape) { + // expected-error@+1 {{failed to legalize operation 'shape.cstr_broadcastable' that was explicitly marked illegal}} + %0 = shape.cstr_broadcastable %arg0, %arg1 : !shape.shape, !shape.shape + shape.assuming %0 { + } + func.return +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_unfuse_mhlo_batch_norm.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_unfuse_mhlo_batch_norm.mlir new file mode 100644 index 000000000000..e6dd30102e1d --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_unfuse_mhlo_batch_norm.mlir @@ -0,0 +1,30 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-unfuse-mhlo-batch-norm | FileCheck %s + +// CHECK-LABEL: @unfuse_batch_norm +// CHECK-SAME: %[[X:[^:[:space:]]+]] +// CHECK-SAME: %[[SCALE:[^:[:space:]]+]] +// CHECK-SAME: %[[OFFSET:[^:[:space:]]+]] +// CHECK-SAME: %[[MEAN:[^:[:space:]]+]] +// CHECK-SAME: %[[VARIANCE:[^:[:space:]]+]] +func.func @unfuse_batch_norm( + %x: tensor<4x256xf32>, %scale: tensor<256xf32>, %offset: tensor<256xf32>, + %mean: tensor<256xf32>, %variance: tensor<256xf32>) + -> (tensor<4x256xf32>) { + // CHECK-DAG: %[[EPS_BCAST:.+]] = mhlo.constant dense<1.001000e-05> : tensor<256xf32> + // CHECK-DAG: %[[VARIANCE_EPS:.+]] = mhlo.add %[[VARIANCE]], %[[EPS_BCAST]] : tensor<256xf32> + // CHECK-DAG: %[[STDDEV:.+]] = mhlo.sqrt %[[VARIANCE_EPS]] : tensor<256xf32> + // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[STDDEV]]) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<4x256xf32> + // CHECK-DAG: %[[SCALE_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[SCALE]]) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<4x256xf32> + // CHECK-DAG: %[[OFFSET_BCAST:.+]] = 
"mhlo.broadcast_in_dim"(%[[OFFSET]]) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<4x256xf32> + // CHECK-DAG: %[[MEAN_BCAST:.+]] = "mhlo.broadcast_in_dim"(%[[MEAN]]) <{broadcast_dimensions = dense<1> : tensor<1xi64>}> : (tensor<256xf32>) -> tensor<4x256xf32> + // CHECK: %[[X_CENTER:.+]] = mhlo.subtract %[[X]], %[[MEAN_BCAST]] : tensor<4x256xf32> + // CHECK: %[[X_SCALED:.+]] = mhlo.multiply %[[X_CENTER]], %[[SCALE_BCAST]] : tensor<4x256xf32> + // CHECK: %[[X_NORMED:.+]] = mhlo.divide %[[X_SCALED]], %[[STDDEV_BCAST]] : tensor<4x256xf32> + // CHECK: %[[RESULT:.+]] = mhlo.add %[[X_NORMED]], %[[OFFSET_BCAST]] : tensor<4x256xf32> + %0 = "mhlo.batch_norm_inference"(%x, %scale, %offset, %mean, %variance) + {epsilon = 1.001000e-05 : f32, feature_index = 1 : i64} : + (tensor<4x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, + tensor<256xf32>) -> tensor<4x256xf32> + // CHECK-DAG: return %[[RESULT]] + func.return %0 : tensor<4x256xf32> +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_unwrap_xla_call_module_op.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_unwrap_xla_call_module_op.mlir new file mode 100644 index 000000000000..e31ec5a24cf8 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_unwrap_xla_call_module_op.mlir @@ -0,0 +1,53 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-unwrap-xla-call-module-op | FileCheck %s + +// Tests if XlaCallModule op without quantizable trait that calls function with +// '_from_xla_call_module' trait is unwrapped. +// Tests if XlaCallModule op with quantizable trait is not unwrapped. +// Tests if XlaCallModule op without quantizable trait that calls function +// without '_from_xla_call_module' trait is not unwrapped. 
+ +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1682 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: @main_00 + // CHECK: %[[ARG0:.*]]: tensor<10x1x1024xf32> + func.func private @main_00(%arg0: tensor<10x1x1024xf32>) -> tensor<6x5xf32> attributes {tf._original_func_name = "main_0"} { + %0 = "tf.Const"() <{value = dense<1.000000e+00> : tensor<10x1024x3xf32>}> : () -> tensor<10x1024x3xf32> + %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<10x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn_1, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + %2 = "tf.XlaCallModule"(%1) <{Sout = [#tf_type.shape<3x10>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @main_0, _stablehlo_version = "1.0.0", _stablehlo_module_attrs = {}, device = ""} : (tensor<10x1x3xf32>) -> tensor<3x10xf32> + %3 = "tf.XlaCallModule"(%2) <{Sout = [#tf_type.shape<6x5>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @main_1, _stablehlo_version = "1.0.0", _stablehlo_module_attrs = {}, device = ""} : (tensor<3x10xf32>) -> tensor<6x5xf32> + return %3 : tensor<6x5xf32> + } + // CHECK: %[[CST:.*]] = "tf.Const"() + // CHECK-NEXT: %[[CALL1:.*]] = "tf.XlaCallModule"(%[[ARG0]], %[[CST]]) + // CHECK-SAME: _entry_function = @composite_dot_general_fn_1 + // CHECK-SAME: _tfl_quant_trait = "fully_quantizable" + // CHECK-NOT: "tf.XlaCallModule" + // CHECK-NEXT: %[[RESHAPE:.*]] = stablehlo.reshape %[[CALL1]] : (tensor<10x1x3xf32>) -> tensor<3x10xf32> + // CHECK-NEXT: %[[CALL2:.*]] = "tf.XlaCallModule"(%[[RESHAPE]]) + // CHECK-SAME: _entry_function = @main_1 + // CHECK-NOT: _tfl_quant_trait = "fully_quantizable" + // CHECK-NEXT: return %[[CALL2]] + + // CHECK: @composite_dot_general_fn_1 + func.func private @composite_dot_general_fn_1(%arg0: tensor<10x1x1024xf32>, %arg1: tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0] x [0], contracting_dims = [2] x [1] {mhlo.frontend_attributes = {grad_x = "false", grad_y = "false"}} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + return %0 : tensor<10x1x3xf32> + } + // CHECK: %[[DOT:.*]] = stablehlo.dot_general + // CHECK-NEXT: return %[[DOT]] + + // CHECK: @main_0 + func.func private @main_0(%arg0: tensor<10x1x3xf32>) -> tensor<3x10xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.reshape %arg0 : (tensor<10x1x3xf32>) -> tensor<3x10xf32> + return %0 : tensor<3x10xf32> + } + // CHECK: %[[RESHAPE:.*]] = stablehlo.reshape + // CHECK-NEXT: return %[[RESHAPE]] + + // CHECK: @main_1 + func.func private @main_1(%arg0: tensor<3x10xf32>) -> tensor<6x5xf32> { + %0 = stablehlo.reshape %arg0 : (tensor<3x10xf32>) -> tensor<6x5xf32> + return %0 : tensor<6x5xf32> + } + // CHECK: %[[RESHAPE:.*]] = stablehlo.reshape + // CHECK-NEXT: return %[[RESHAPE]] +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_xla_call_module_to_call.mlir 
b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_xla_call_module_to_call.mlir new file mode 100644 index 000000000000..15374881b677 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/tf_xla_call_module_to_call.mlir @@ -0,0 +1,23 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -tf-stablehlo-xla-call-module-to-call | FileCheck %s + +// ----- + +// Tests composite tf.XlaCallModule is converted to func.call. + +module { + // CHECK-LABEL: func.func @main + func.func @main(%arg0: tensor<1x1024xf32>) -> tensor<1x3xf32> { + // CHECK: call @composite_dot_general_fn_1 + // CHECK-SAME: (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + // CHECK-NOT: tf.XlaCallModule + %0 = "tf.Const"() <{value = dense<0.5> : tensor<1024x3xf32>}> : () -> tensor<1024x3xf32> + %2 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn_1, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } + // CHECK-LABEL: func.func private @composite_dot_general_fn_1 + // CHECK-SAME: -> tensor<1x3xf32> + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc b/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc index c14cff879848..105ab22d159b 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc @@ -28,10 +28,12 @@ limitations under the License. 
#include "stablehlo/transforms/Passes.h" // from @stablehlo #include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" @@ -46,6 +48,7 @@ int main(int argc, char** argv) { mlir::registerAllPasses(); mlir::registerTensorFlowPasses(); mlir::quant::stablehlo::registerPasses(); + mlir::tf_quant::stablehlo::registerPasses(); mlir::quant::stablehlo::registerBridgePasses(); mlir::stablehlo::registerPasses(); mlir::mhlo::registerAllMhloPasses(); @@ -64,7 +67,7 @@ int main(int argc, char** argv) { mlir::quantfork::QuantizationForkDialect, mlir::stablehlo::StablehloDialect, mlir::tf_executor::TensorFlowExecutorDialect, - mlir::vhlo::VhloDialect>(); + mlir::vhlo::VhloDialect, mlir::quant::ir::TFQuantDialect>(); mlir::mhlo::registerAllMhloDialects(registry); mlir::func::registerAllExtensions(registry); return failed( diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tools/tf_stablehlo_quant_opt.cc b/tensorflow/compiler/mlir/quantization/stablehlo/tools/tf_stablehlo_quant_opt.cc new file mode 100644 index 000000000000..e79b539f00ea --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tools/tf_stablehlo_quant_opt.cc @@ -0,0 +1,73 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "llvm/Support/LogicalResult.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/Extensions/AllExtensions.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project +#include "mlir/Dialect/SCF/IR/SCF.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project +#include "mlir/InitAllPasses.h" // from @llvm-project +#include "mlir/Tools/mlir-opt/MlirOptMain.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "stablehlo/dialect/VhloOps.h" // from @stablehlo +#include "stablehlo/transforms/Passes.h" // from @stablehlo +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "xla/mlir_hlo/mhlo/IR/register.h" +#include "xla/mlir_hlo/mhlo/transforms/passes.h" +#include "tensorflow/core/ir/types/dialect.h" + +int main(int argc, char** argv) { + tensorflow::InitMlir y(&argc, &argv); + + mlir::registerAllPasses(); + mlir::registerTensorFlowPasses(); + mlir::quant::stablehlo::registerPasses(); + mlir::tf_quant::stablehlo::registerPasses(); + mlir::quant::stablehlo::registerBridgePasses(); + mlir::stablehlo::registerPasses(); + mlir::mhlo::registerAllMhloPasses(); + // These passes are only used for testing purposes. + mlir::quant::stablehlo::testing::registerTestPasses(); + + // Register StableHLO Quantizer pass pipelines. 
+ mlir::quant::stablehlo::RegisterPassPipelines(); + + mlir::DialectRegistry registry; + registry.insert(); + mlir::mhlo::registerAllMhloDialects(registry); + mlir::func::registerAllExtensions(registry); + return failed( + mlir::MlirOptMain(argc, argv, "StableHLO quant Pass Driver\n", registry)); +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD index f6cfec951ebb..aaac4ad0e59d 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD @@ -86,6 +86,29 @@ cc_library( ], ) +td_library( + name = "tf_quant_td_files", + srcs = [ + "passes/tf_cast_bf16_ops_to_f32.td", + "passes/tf_convert_tf_xla_op_to_tf_op.td", + "passes/tf_lift_quantizable_spots_as_functions.td", + "passes/tf_lift_quantizable_spots_as_functions_drq.td", + "passes/tf_optimize.td", + "passes/tf_post_quantize.td", + "passes/tf_prepare_lifting.td", + "passes/tf_quantize_composite_functions.td", + "passes/tf_replace_cast_hacks_with_tf_xla_ops.td", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/common:quant_td_files", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantizationOpsTdFiles", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", + "@llvm-project//mlir:ArithOpsTdFiles", + "@llvm-project//mlir:FuncTdFiles", + ], +) + td_library( name = "quant_td_files", srcs = [ @@ -114,114 +137,136 @@ td_library( gentbl_cc_library( name = "convert_tf_xla_op_to_tf_op_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/convert_tf_xla_op_to_tf_op.inc", - ), - ], + tbl_outs = {"passes/convert_tf_xla_op_to_tf_op.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/convert_tf_xla_op_to_tf_op.td", deps = [":quant_td_files"], ) +gentbl_cc_library( + name = "tf_convert_tf_xla_op_to_tf_op_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"passes/tf_convert_tf_xla_op_to_tf_op.inc": ["-gen-rewriters"]}, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes/tf_convert_tf_xla_op_to_tf_op.td", + deps = [":tf_quant_td_files"], +) + gentbl_cc_library( name = "cast_bf16_ops_to_f32_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/cast_bf16_ops_to_f32.inc", - ), - ], + tbl_outs = {"passes/cast_bf16_ops_to_f32.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/cast_bf16_ops_to_f32.td", deps = [":quant_td_files"], ) +gentbl_cc_library( + name = "tf_cast_bf16_ops_to_f32_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"passes/tf_cast_bf16_ops_to_f32.inc": ["-gen-rewriters"]}, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes/tf_cast_bf16_ops_to_f32.td", + deps = [":tf_quant_td_files"], +) + gentbl_cc_library( name = "prepare_lifting_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/prepare_lifting.inc", - ), - ], + tbl_outs = {"passes/prepare_lifting.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/prepare_lifting.td", deps = [":quant_td_files"], ) +gentbl_cc_library( + name = "tf_prepare_lifting_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"passes/tf_prepare_lifting.inc": ["-gen-rewriters"]}, + tblgen = 
"@llvm-project//mlir:mlir-tblgen", + td_file = "passes/tf_prepare_lifting.td", + deps = [":tf_quant_td_files"], +) + gentbl_cc_library( name = "lift_quantizable_spots_as_functions_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/lift_quantizable_spots_as_functions.inc", - ), - ], + tbl_outs = {"passes/lift_quantizable_spots_as_functions.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/lift_quantizable_spots_as_functions.td", deps = [":quant_td_files"], ) +gentbl_cc_library( + name = "tf_lift_quantizable_spots_as_functions_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"passes/tf_lift_quantizable_spots_as_functions.inc": ["-gen-rewriters"]}, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes/tf_lift_quantizable_spots_as_functions.td", + deps = [":tf_quant_td_files"], +) + gentbl_cc_library( name = "lift_quantizable_spots_as_functions_drq_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/lift_quantizable_spots_as_functions_drq.inc", - ), - ], + tbl_outs = {"passes/lift_quantizable_spots_as_functions_drq.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/lift_quantizable_spots_as_functions_drq.td", deps = [":quant_td_files"], ) +gentbl_cc_library( + name = "tf_lift_quantizable_spots_as_functions_drq_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"passes/tf_lift_quantizable_spots_as_functions_drq.inc": ["-gen-rewriters"]}, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes/tf_lift_quantizable_spots_as_functions_drq.td", + deps = [":tf_quant_td_files"], +) + gentbl_cc_library( name = "prepare_quantize_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/prepare_quantize.inc", - ), - ], + tbl_outs = {"passes/prepare_quantize.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/prepare_quantize.td", deps = [":quant_td_files"], ) +gentbl_cc_library( + name = "tf_prepare_quantize_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"passes/tf_prepare_quantize.inc": ["-gen-rewriters"]}, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes/tf_prepare_quantize.td", + deps = [":tf_quant_td_files"], +) + gentbl_cc_library( name = "quantize_composite_functions_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/quantize_composite_functions.inc", - ), - ], + tbl_outs = {"passes/quantize_composite_functions.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/quantize_composite_functions.td", deps = [":quant_td_files"], ) +gentbl_cc_library( + name = "tf_quantize_composite_functions_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"passes/tf_quantize_composite_functions.inc": ["-gen-rewriters"]}, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes/tf_quantize_composite_functions.td", + deps = [":tf_quant_td_files"], +) + gentbl_cc_library( name = "tf_quant_ops_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-decls"], - "passes/tf_quant_ops.h.inc", - ), - ( - ["-gen-op-defs"], - "passes/tf_quant_ops.cc.inc", - ), - ], + tbl_outs = { + "passes/tf_quant_ops.h.inc": ["-gen-op-decls"], + "passes/tf_quant_ops.cc.inc": 
["-gen-op-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/tf_quant_ops.td", deps = [ @@ -232,54 +277,61 @@ gentbl_cc_library( gentbl_cc_library( name = "optimize_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/optimize.inc", - ), - ], + tbl_outs = {"passes/optimize.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/optimize.td", deps = [":quant_td_files"], ) +gentbl_cc_library( + name = "tf_optimize_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"passes/tf_optimize.inc": ["-gen-rewriters"]}, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes/tf_optimize.td", + deps = [":tf_quant_td_files"], +) + gentbl_cc_library( name = "convert_tpu_model_to_cpu_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/convert_tpu_model_to_cpu.inc", - ), - ], + tbl_outs = {"passes/convert_tpu_model_to_cpu.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/convert_tpu_model_to_cpu.td", deps = [":quant_td_files"], ) +gentbl_cc_library( + name = "tf_convert_tpu_model_to_cpu_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"passes/tf_convert_tpu_model_to_cpu.inc": ["-gen-rewriters"]}, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes/tf_convert_tpu_model_to_cpu.td", + deps = [":tf_quant_td_files"], +) + gentbl_cc_library( name = "post_quantize_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/post_quantize.inc", - ), - ], + tbl_outs = {"passes/post_quantize.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/post_quantize.td", deps = [":quant_td_files"], ) +gentbl_cc_library( + name = "tf_post_quantize_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"passes/tf_post_quantize.inc": ["-gen-rewriters"]}, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes/tf_post_quantize.td", + deps = [":tf_quant_td_files"], +) + gentbl_cc_library( name = "preprocess_op_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/preprocess_op.inc", - ), - ], + tbl_outs = {"passes/preprocess_op.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/preprocess_op.td", deps = [":quant_td_files"], @@ -319,17 +371,21 @@ cc_library( gentbl_cc_library( name = "replace_cast_hacks_with_tf_xla_ops_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/replace_cast_hacks_with_tf_xla_ops.inc", - ), - ], + tbl_outs = {"passes/replace_cast_hacks_with_tf_xla_ops.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/replace_cast_hacks_with_tf_xla_ops.td", deps = [":quant_td_files"], ) +gentbl_cc_library( + name = "tf_replace_cast_hacks_with_tf_xla_ops_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = {"passes/tf_replace_cast_hacks_with_tf_xla_ops.inc": ["-gen-rewriters"]}, + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes/tf_replace_cast_hacks_with_tf_xla_ops.td", + deps = [":tf_quant_td_files"], +) + cc_library( name = "passes", srcs = [ @@ -402,7 +458,6 @@ cc_library( ":remove_identity_op_pattern", ":replace_cast_hacks_with_tf_xla_ops_inc_gen", ":tf_quant_ops", - 
"//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", "//tensorflow/compiler/mlir/quantization/common:func", @@ -475,6 +530,183 @@ cc_library( alwayslink = True, ) +cc_library( + name = "tf_passes", + srcs = [ + "passes/quantized_function_library.h", + "passes/tf_add_dump_tensor_op.cc", + "passes/tf_add_quantization_unit_loc.cc", + "passes/tf_cast_bf16_ops_to_f32.cc", + "passes/tf_cast_bf16_ops_to_f32.inc", + "passes/tf_convert_custom_aggregation_op_to_quant_stats.cc", + "passes/tf_convert_fake_quant_to_qdq.cc", + "passes/tf_convert_tf_xla_op_to_tf_op.cc", + "passes/tf_convert_tf_xla_op_to_tf_op.inc", + "passes/tf_convert_tpu_model_to_cpu.cc", + "passes/tf_convert_tpu_model_to_cpu.inc", + "passes/tf_duplicate_shape_determining_constants.cc", + "passes/tf_insert_custom_aggregation_ops.cc", + "passes/tf_insert_main_function.cc", + "passes/tf_insert_quantized_functions.cc", + "passes/tf_insert_restore_op.cc", + "passes/tf_insert_save_op.cc", + "passes/tf_lift_hashtable_ops_as_args.cc", + "passes/tf_lift_quantizable_spots_as_functions.cc", + "passes/tf_lift_quantizable_spots_as_functions.inc", + "passes/tf_lift_quantizable_spots_as_functions_drq.cc", + "passes/tf_lift_quantizable_spots_as_functions_drq.inc", + "passes/tf_mark_functions_noinline.cc", + "passes/tf_merge_duplicate_resource_ops.cc", + "passes/tf_merge_initializer_function_ops_to_main.cc", + "passes/tf_merge_save_function_ops_to_main.cc", + "passes/tf_optimize.cc", + "passes/tf_optimize.inc", + "passes/tf_post_quantize.cc", + "passes/tf_post_quantize.inc", + "passes/tf_prepare_lifting.cc", + "passes/tf_prepare_lifting.inc", + "passes/tf_prepare_quantize.cc", + "passes/tf_prepare_quantize.inc", + "passes/tf_prepare_quantize_drq.cc", + "passes/tf_preprocess_op.cc", + "passes/tf_propagate_quantize_type.cc", + "passes/tf_quantize.cc", + "passes/tf_quantize_composite_functions.cc", + "passes/tf_quantize_composite_functions.inc", + "passes/tf_quantize_weights.cc", + "passes/tf_remove_var_init_by_const.cc", + "passes/tf_replace_cast_hacks_with_tf_xla_ops.cc", + "passes/tf_replace_cast_hacks_with_tf_xla_ops.inc", + "passes/tf_unfreeze_constants.cc", + ], + hdrs = [ + "passes/constants.h", + "passes/tf_passes.h", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + ":lift_quantizable_spots_as_functions_drq_inc_gen", + ":manipulate_model_attr", + ":preprocess_op_gen", + ":quantization_options_proto_cc", + ":remove_identity_op_pattern", + ":tf_cast_bf16_ops_to_f32_inc_gen", + ":tf_convert_tf_xla_op_to_tf_op_inc_gen", + ":tf_convert_tpu_model_to_cpu_inc_gen", + ":tf_lift_quantizable_spots_as_functions_drq_inc_gen", + ":tf_lift_quantizable_spots_as_functions_inc_gen", + ":tf_optimize_inc_gen", + ":tf_post_quantize_inc_gen", + ":tf_prepare_lifting_inc_gen", + ":tf_prepare_quantize_inc_gen", + ":tf_quant_ops", + ":tf_quantize_composite_functions_inc_gen", + ":tf_replace_cast_hacks_with_tf_xla_ops_inc_gen", + "//tensorflow/compiler/mlir/quantization/common:func", + "//tensorflow/compiler/mlir/quantization/common:tf_attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:tf_lift_as_function_call", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib", + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib:tf_quantization_config", + 
"//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:calibration_parameters", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:const_op_size", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:quantization_unit_loc", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:tf_constant_fold", + "//tensorflow/compiler/mlir/quantization/tensorflow/ops:temp_tf_op_quant_spec", + "//tensorflow/compiler/mlir/quantization/tensorflow/ops:tf_tf_quantize_op", + "//tensorflow/compiler/mlir/quantization/tensorflow/utils:temp_fake_quant_utils", + "//tensorflow/compiler/mlir/quantization/tensorflow/utils:tf_tf_to_uniform_attribute_utils", + "//tensorflow/compiler/mlir/quantization/tensorflow/utils:tf_tf_to_xla_attribute_utils", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:import_model", + "//tensorflow/compiler/mlir/tensorflow:mangling_util", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs", + "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_freeze_variables", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes", + "//tensorflow/compiler/mlir/utils:name_utils", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/ir/importexport:convert_tensor", + "//tensorflow/core/platform:macros", + "//tensorflow/core/platform:path", + "//tensorflow/core/tpu:tpu_defs", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/cleanup", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_googlesource_code_re2//:re2", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:FunctionInterfaces", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Rewrite", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + "@llvm-project//mlir:UBDialect", + "@local_xla//xla:xla_data_proto_cc", + ], + # Alwayslink is required for registering the MLIR passes. + # TODO(b/255530126): Split the pass registration from the definitions to avoid binary size bloat. 
+ alwayslink = True, +) + +cc_library( + name = "tf_quantize_preprocess", + srcs = [ + "tf_quantize_preprocess.cc", + ], + hdrs = [ + "tf_quantize_preprocess.h", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + ":tf_passes", + "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/quantization/stablehlo:bridge_passes", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:tf_pass_pipeline", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", + "//tensorflow/compiler/mlir/stablehlo:fold_broadcast_pass", + "//tensorflow/compiler/mlir/stablehlo:legalize_tf_xla_call_module_to_stablehlo_pass", + "//tensorflow/compiler/mlir/stablehlo:rename_entrypoint_to_main", + "//tensorflow/compiler/mlir/stablehlo:tf_fuse_convolution_pass", + "//tensorflow/compiler/mlir/stablehlo:tf_stablehlo", + "//tensorflow/compiler/mlir/stablehlo:unfuse_batch_norm_pass", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_freeze_variables", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core/platform:path", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Transforms", + "@local_xla//xla/mlir_hlo:all_passes", + ], +) + cc_library( name = "quantize_preprocess", srcs = [ @@ -487,15 +719,15 @@ cc_library( deps = [ ":passes", "//tensorflow/compiler/mlir/lite:tensorflow_lite", - "//tensorflow/compiler/mlir/lite/stablehlo:fuse_convolution_pass", - "//tensorflow/compiler/mlir/lite/stablehlo:legalize_tf_xla_call_module_to_stablehlo_pass", - "//tensorflow/compiler/mlir/lite/stablehlo:rename_entrypoint_to_main", - "//tensorflow/compiler/mlir/lite/stablehlo:tf_stablehlo", - "//tensorflow/compiler/mlir/lite/stablehlo:unfuse_batch_norm_pass", "//tensorflow/compiler/mlir/quantization/stablehlo:bridge_passes", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:pass_pipeline", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", "//tensorflow/compiler/mlir/stablehlo:fold_broadcast_pass", + "//tensorflow/compiler/mlir/stablehlo:fuse_convolution_pass", + "//tensorflow/compiler/mlir/stablehlo:legalize_tf_xla_call_module_to_stablehlo_pass", + "//tensorflow/compiler/mlir/stablehlo:rename_entrypoint_to_main", + "//tensorflow/compiler/mlir/stablehlo:tf_stablehlo", + "//tensorflow/compiler/mlir/stablehlo:unfuse_batch_norm_pass", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", @@ -537,6 +769,25 @@ cc_library( ], ) +cc_library( + name = "tf_quantize_passes", + srcs = ["tf_quantize_passes.cc"], + hdrs = ["tf_quantize_passes.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":quantization_options_proto_cc", + ":tf_passes", + "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", + "//tensorflow/core/platform:path", + "@com_google_absl//absl/strings", + "@llvm-project//mlir:FuncDialect", + 
"@llvm-project//mlir:Pass", + "@llvm-project//mlir:Transforms", + "@local_xla//xla/mlir_hlo:mhlo_passes", + ], +) + # OSS only: This target is header-only. Link `quantization_options_proto_impl` only to # `libtensorflow_framework.so` via `lib_internal_impl`. Do NOT link # `quantization_options_proto_impl` directly unless the target does not link @@ -593,8 +844,10 @@ tf_cc_binary( srcs = ["passes/tf_quant_opt.cc"], deps = [ ":passes", + ":tf_passes", "//tensorflow/compiler/mlir:init_mlir", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op.cc index e72a71f4a35d..09dfcae58466 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op.cc @@ -74,7 +74,7 @@ proto file.)doc"); class CalibrationStatisticsSaverOp : public OpKernel { public: explicit CalibrationStatisticsSaverOp( - absl::Nonnull context) + OpKernelConstruction* absl_nonnull context) : OpKernel(context) { std::string output_file_path; OP_REQUIRES_OK(context, @@ -128,7 +128,7 @@ class CalibrationStatisticsSaverOp : public OpKernel { } } - void Compute(absl::Nonnull context) override { + void Compute(OpKernelContext* absl_nonnull context) override { for (int idx = 0; idx < ids_.size(); ++idx) { AssignIfNotExists( ids_[idx], static_cast(calibration_methods_[idx])); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD index 61c9ad722977..e605104708ef 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD @@ -56,6 +56,7 @@ tf_cc_test( "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", + "@com_google_googletest//:gtest_main", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", @@ -73,6 +74,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:tensorflow_remaining_ops", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "@com_google_absl//absl/algorithm:container", "@llvm-project//mlir:IR", ], ) @@ -86,6 +88,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest_main", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", @@ -139,7 +142,9 @@ cc_library( "//tensorflow/compiler/mlir/quantization/tensorflow/debugging:mlir_dump", "//tensorflow/compiler/mlir/tensorflow:error_util", "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/strings:string_view", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@local_xla//xla/tsl/platform:errors", @@ -147,6 +152,47 @@ cc_library( ], ) +cc_library( + name = "tf_constant_fold", + srcs = [ + "tf_constant_fold.cc", + ], + hdrs = [ + "tf_constant_fold.h", + ], + compatible_with = 
get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/common:tf_lift_as_function_call", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow/transforms:constant_fold_utils", + "@com_google_absl//absl/container:flat_hash_set", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + ], +) + +tf_cc_test( + name = "tf_constant_fold_test", + srcs = ["tf_constant_fold_test.cc"], + deps = [ + ":tf_constant_fold", + "//tensorflow/compiler/mlir/quantization/common:tf_attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:tf_test_base", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/core:tensorflow", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + ], +) + cc_library( name = "constant_fold", srcs = [ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size.cc index 2c1b85ba1945..c12f70785ea3 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size.cc @@ -15,7 +15,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size.h" #include +#include +#include "absl/algorithm/container.h" #include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size_test.cc index 5206aceec7b4..7879b7e8cb46 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size_test.cc @@ -14,6 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size.h" +#include + +#include #include "absl/strings/string_view.h" #include "llvm/Support/Casting.h" #include "mlir/IR/AsmState.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.cc index 60d2c07bdab8..fe6141fb9cb9 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.cc @@ -14,7 +14,6 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.h" -#include "absl/algorithm/container.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc index 8deda7c61383..2fba7211a71d 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project @@ -48,6 +49,27 @@ std::string GenerateQuantizationUnitString( kQuantizationUnitSuffix); } +std::optional<StringRef> CallerNameFromCallSiteLoc(CallSiteLoc callsite_loc) { + // loc(callsite("func" at "QuantizationUnit(...)")) + if (mlir::isa<NameLoc>(callsite_loc.getCaller())) { + return mlir::cast<NameLoc>(callsite_loc.getCaller()).getName().strref(); + } + + // loc(callsite("func" at callsite("QuantizationUnit(...)" at ...))) + if (mlir::isa<CallSiteLoc>(callsite_loc.getCaller())) { + CallSiteLoc caller_callsite_loc = + mlir::cast<CallSiteLoc>(callsite_loc.getCaller()); + + if (mlir::isa<NameLoc>(caller_callsite_loc.getCallee())) { + return mlir::cast<NameLoc>(caller_callsite_loc.getCallee()) + .getName() + .strref(); + } + } + + return std::nullopt; +} + } // namespace QuantizationUnitLoc::QuantizationUnitLoc(MLIRContext* context, @@ -65,22 +87,25 @@ bool QuantizationUnitLoc::classof(Attribute attr) { if (!llvm::isa<CallSiteLoc>(attr)) return false; auto callsite_loc = llvm::dyn_cast<CallSiteLoc>(attr); - if (!mlir::isa<NameLoc>(callsite_loc.getCaller())) return false; - StringRef caller_name = - mlir::cast<NameLoc>(callsite_loc.getCaller()).getName().strref(); - return caller_name.starts_with(kQuantizationUnitPrefix) && - caller_name.ends_with(kQuantizationUnitSuffix); + std::optional<StringRef> caller_name = + CallerNameFromCallSiteLoc(callsite_loc); + + return caller_name && caller_name->starts_with(kQuantizationUnitPrefix) && + caller_name->ends_with(kQuantizationUnitSuffix); } std::optional<QuantizationUnitLoc::QuantizationUnit> FindQuantizationUnitFromLoc(Location loc) { if (isa<CallSiteLoc>(loc)) { - Location caller = mlir::cast<CallSiteLoc>(loc).getCaller(); - StringRef caller_name = mlir::cast<NameLoc>(caller).getName().strref(); + std::optional<StringRef> caller_name = + CallerNameFromCallSiteLoc(mlir::cast<CallSiteLoc>(loc)); + if (!caller_name) { + return std::nullopt; + } const size_t start_index = kQuantizationUnitPrefix.size(); - const size_t end_index = caller_name.rfind(kQuantizationUnitSuffix); + const size_t end_index = caller_name->rfind(kQuantizationUnitSuffix); std::string serialized_proto = - caller_name.substr(start_index, end_index - start_index).str(); + caller_name->substr(start_index, end_index - start_index).str(); QuantizationUnitLoc::QuantizationUnit quant_unit; if (quant_unit.ParseFromString(serialized_proto)) { return quant_unit; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.cc index b6380c8de8d8..cebab2d63d98 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.cc @@ -17,6
+17,9 @@ limitations under the License. #include #include +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h b/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h index b3d60f7c6b5e..89b066d5df20 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h @@ -19,6 +19,7 @@ limitations under the License. #include "absl/status/status.h" #include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.cc index 73e8256e3384..cbbd11f59271 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/string_view.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc index f0a71cf8a9ef..74ef3189770c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc @@ -14,13 +14,14 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.h" +#include #include #include +#include #include "absl/cleanup/cleanup.h" #include "absl/status/status.h" #include "absl/status/statusor.h" -#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold.cc new file mode 100644 index 000000000000..b29a6d201fdc --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold.cc @@ -0,0 +1,146 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold.h" + +#include "absl/container/flat_hash_set.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.h" + +namespace mlir { +namespace tf_quant { +namespace { + +// Folds the operation recursively and returns the results. +LogicalResult FoldOperation(OpBuilder& builder, Operation* op, + SmallVector<Value>& results) { + SmallVector<ElementsAttr> inputs; + for (auto operand : op->getOperands()) { + auto preceding_const_op = operand.getDefiningOp<TF::ConstOp>(); + if (preceding_const_op) { + inputs.push_back(preceding_const_op.getValue()); + continue; + } + + Operation* preceding_op = operand.getDefiningOp(); + int preceding_result_id = -1; + for (auto preceding_result : preceding_op->getResults()) { + if (operand == preceding_result) { + preceding_result_id = preceding_result.getResultNumber(); + break; + } + } + SmallVector<Value> preceding_results; + if (failed(FoldOperation(builder, preceding_op, preceding_results))) { + return failure(); + } + auto preceding_result = preceding_results[preceding_result_id]; + preceding_const_op = preceding_result.getDefiningOp<TF::ConstOp>(); + inputs.push_back(preceding_const_op.getValue()); + } + + SmallVector<Attribute> result_values; + if (failed(TF::EvaluateOperation(op, inputs, result_values))) { + return failure(); + } + + results.clear(); + builder.setInsertionPointAfter(op); + for (const auto& result_value : result_values) { + results.push_back(builder.create<TF::ConstOp>(op->getLoc(), result_value)); + } + return success(); +} + +bool IsOperationFoldable(Operation* op) { + if (isa<TF::ConstOp>(op)) return true; + + if (op->getDialect()->getNamespace() != "tf" || !TF::CanBeFolded(op)) { + return false; + } + + // Check if the operands are foldable as well. + for (auto operand : op->getOperands()) { + auto preceding_op = operand.getDefiningOp(); + if (!preceding_op || !IsOperationFoldable(preceding_op)) { + return false; + } + } + + return true; +} + +// TODO: b/289744814 - Refactor to have a single source of truth of TF Quant +// specs.
+absl::flat_hash_set<int> GetQuantizableOperands(Operation* op) { + absl::flat_hash_set<int> quantizable_operands; + if (isa(op)) { + quantizable_operands.insert(1); + } else if (isa(op)) { + quantizable_operands.insert(0); + } else if (auto einsum_op = dyn_cast<TF::EinsumOp>(op)) { + if (IsEinsumSupportedByXlaDotV2(einsum_op.getEquationAttr())) { + quantizable_operands.insert(1); + } + } + return quantizable_operands; +} +} // namespace + +SmallVector<Value> ConstantFoldOpIfPossible(Operation* op) { + if (!IsOperationFoldable(op)) return op->getResults(); + + OpBuilder builder(op); + SmallVector<Value> results; + if (failed(FoldOperation(builder, op, results))) { + return op->getResults(); + } + return results; +} + +LogicalResult ConstantFoldQuantizableOperands::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + absl::flat_hash_set<int> quantizable_operands = GetQuantizableOperands(op); + if (quantizable_operands.empty()) return failure(); + + bool has_change = false; + for (auto operand_idx : quantizable_operands) { + Value operand = op->getOperand(operand_idx); + Operation* preceding_op = operand.getDefiningOp(); + if (!preceding_op || isa<TF::ConstOp>(preceding_op)) continue; + + int preceding_result_idx = -1; + for (auto preceding_result : preceding_op->getResults()) { + if (operand == preceding_result) { + preceding_result_idx = preceding_result.getResultNumber(); + break; + } + } + + has_change = has_change || IsOperationFoldable(preceding_op); + SmallVector<Value> folded_results = ConstantFoldOpIfPossible(preceding_op); + op->setOperand(operand_idx, folded_results[preceding_result_idx]); + } + + return success(/*isSuccess=*/has_change); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold.h b/tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold.h new file mode 100644 index 000000000000..03487b737596 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold.h @@ -0,0 +1,43 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_TF_CONSTANT_FOLD_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_TF_CONSTANT_FOLD_H_ + +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project + +namespace mlir { +namespace tf_quant { + +// Applies constant folding recursively if the operation and all of its operands +// are foldable. Returns the constants generated by constant-folding or the +// original operation's outputs if not folded. +SmallVector<Value> ConstantFoldOpIfPossible(Operation* op); + +// This pattern tries to constant-fold the quantizable operands of supported +// TF operations.
+struct ConstantFoldQuantizableOperands : public RewritePattern { + public: + explicit ConstantFoldQuantizableOperands(MLIRContext* context) + : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {} + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override; +}; + +} // namespace tf_quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_TF_CONSTANT_FOLD_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold_test.cc new file mode 100644 index 000000000000..a06d8c11da10 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold_test.cc @@ -0,0 +1,201 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold.h" + +#include + +#include +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_test_base.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/platform/test.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using ::testing::NotNull; +using ::testing::SizeIs; + +using ConstantFoldingTest = ::mlir::tf_quant::QuantizationTestBase; + +TEST_F(ConstantFoldingTest, FoldLargeConstant) { + constexpr absl::string_view kModuleCode = R"mlir( + module { + func.func @test_fold_constant() -> (tensor<1024x24x24x3xf32>) { + %zp = "tf.Const"() {value = dense<2> : tensor<i32>} : () -> tensor<i32> + %scale = "tf.Const"() {value = dense<2.0> : tensor<f32>} : () -> tensor<f32> + %weight = "tf.Const"() {value = dense<1> : tensor<1024x24x24x3xi8>} : () -> tensor<1024x24x24x3xi8> + %input_i32 = "tf.Cast"(%weight) : (tensor<1024x24x24x3xi8>) -> tensor<1024x24x24x3xi32> + %output = "tf.Sub"(%input_i32, %zp) : (tensor<1024x24x24x3xi32>, tensor<i32>) -> tensor<1024x24x24x3xi32> + %cast = "tf.Cast"(%output) : (tensor<1024x24x24x3xi32>) -> tensor<1024x24x24x3xf32> + %mul = "tf.Mul"(%cast, %scale) : (tensor<1024x24x24x3xf32>, tensor<f32>) -> tensor<1024x24x24x3xf32> + func.return %mul : tensor<1024x24x24x3xf32> + } + } + )mlir"; + + OwningOpRef<ModuleOp> module_op_ref = ParseModuleOpString(kModuleCode); + const auto test_func = + module_op_ref->lookupSymbol<func::FuncOp>("test_fold_constant"); + ASSERT_THAT(test_func,
NotNull()); + + Operation* mul_op = FindOperationOfType<TF::MulOp>(test_func); + SmallVector<Value> results = ConstantFoldOpIfPossible(mul_op); + EXPECT_THAT(results, SizeIs(1)); + EXPECT_TRUE(isa<TF::ConstOp>(results[0].getDefiningOp())); +} + +TEST_F(ConstantFoldingTest, NotFoldingIdentity) { + constexpr absl::string_view kModuleCode = R"mlir( + module { + func.func @test_fold_constant() -> (tensor<1024x24x24x3xf32>) { + %zp = "tf.Const"() {value = dense<2> : tensor<i32>} : () -> tensor<i32> + %scale = "tf.Const"() {value = dense<2.0> : tensor<f32>} : () -> tensor<f32> + %weight = "tf.Const"() {value = dense<1> : tensor<1024x24x24x3xi8>} : () -> tensor<1024x24x24x3xi8> + %input_i32 = "tf.Cast"(%weight) : (tensor<1024x24x24x3xi8>) -> tensor<1024x24x24x3xi32> + %output = "tf.Sub"(%input_i32, %zp) : (tensor<1024x24x24x3xi32>, tensor<i32>) -> tensor<1024x24x24x3xi32> + %cast = "tf.Cast"(%output) : (tensor<1024x24x24x3xi32>) -> tensor<1024x24x24x3xf32> + %identity = "tf.Identity"(%scale) : (tensor<f32>) -> tensor<f32> + %mul = "tf.Mul"(%cast, %identity) : (tensor<1024x24x24x3xf32>, tensor<f32>) -> tensor<1024x24x24x3xf32> + func.return %mul : tensor<1024x24x24x3xf32> + } + } + )mlir"; + + OwningOpRef<ModuleOp> module_op_ref = ParseModuleOpString(kModuleCode); + const auto test_func = + module_op_ref->lookupSymbol<func::FuncOp>("test_fold_constant"); + ASSERT_THAT(test_func, NotNull()); + + Operation* op_to_fold = FindOperationOfType<TF::MulOp>(test_func); + SmallVector<Value> results = ConstantFoldOpIfPossible(op_to_fold); + EXPECT_THAT(results, SizeIs(1)); + // No constant-folding since the IdentityOp has `TF_NoConstantFold` trait. + auto mul_op = dyn_cast_or_null<TF::MulOp>(results[0].getDefiningOp()); + EXPECT_THAT(mul_op, NotNull()); + // Even though the preceding CastOp is foldable, it shouldn't be folded since + // we are calling from the MulOp. + EXPECT_TRUE(isa<TF::CastOp>(mul_op.getX().getDefiningOp())); +} + +TEST_F(ConstantFoldingTest, NotFoldingArgument) { + constexpr absl::string_view kModuleCode = R"mlir( + module { + func.func @test_fold_constant(%arg0: tensor<f32>) -> (tensor<1024x24x24x3xf32>) { + %zp = "tf.Const"() {value = dense<2> : tensor<i32>} : () -> tensor<i32> + %weight = "tf.Const"() {value = dense<1> : tensor<1024x24x24x3xi8>} : () -> tensor<1024x24x24x3xi8> + %input_i32 = "tf.Cast"(%weight) : (tensor<1024x24x24x3xi8>) -> tensor<1024x24x24x3xi32> + %output = "tf.Sub"(%input_i32, %zp) : (tensor<1024x24x24x3xi32>, tensor<i32>) -> tensor<1024x24x24x3xi32> + %cast = "tf.Cast"(%output) : (tensor<1024x24x24x3xi32>) -> tensor<1024x24x24x3xf32> + %mul = "tf.Mul"(%cast, %arg0) : (tensor<1024x24x24x3xf32>, tensor<f32>) -> tensor<1024x24x24x3xf32> + func.return %mul : tensor<1024x24x24x3xf32> + } + } + )mlir"; + + OwningOpRef<ModuleOp> module_op_ref = ParseModuleOpString(kModuleCode); + const auto test_func = + module_op_ref->lookupSymbol<func::FuncOp>("test_fold_constant"); + ASSERT_THAT(test_func, NotNull()); + + Operation* op_to_fold = FindOperationOfType<TF::MulOp>(test_func); + SmallVector<Value> results = ConstantFoldOpIfPossible(op_to_fold); + EXPECT_THAT(results, SizeIs(1)); + // No constant-folding since the second operand is an argument. + TF::MulOp mul_op = dyn_cast_or_null<TF::MulOp>(results[0].getDefiningOp()); + EXPECT_THAT(mul_op, NotNull()); + // Even though the preceding CastOp is foldable, it shouldn't be folded since + // we are calling from the MulOp.
+ EXPECT_TRUE(isa(mul_op.getX().getDefiningOp())); +} + +TEST_F(ConstantFoldingTest, FoldDepthwiseConvWeight) { + constexpr absl::string_view kModuleCode = R"mlir( + module { + func.func @test_fold_constant(%arg0: tensor<*xf32>) -> (tensor) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() {value = dense<0.500000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %cst_2 = "tf.Const"() {value = dense<3.0> : tensor} : () -> tensor + %w = "tf.Mul"(%cst, %cst_2) : (tensor<2x3x3x1xf32>, tensor) -> tensor<2x3x3x1xf32> + %0 = "tf.DepthwiseConv2dNative"(%arg0, %w) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC"} : (tensor, tensor<3xf32>) -> tensor + %2 = "tf.Mul"(%1, %cst_1) : (tensor, tensor<3xf32>) -> tensor + func.return %2 : tensor + } + } + )mlir"; + + OwningOpRef module_op_ref = ParseModuleOpString(kModuleCode); + const auto test_func = + module_op_ref->lookupSymbol("test_fold_constant"); + ASSERT_THAT(test_func, NotNull()); + + RewritePatternSet patterns(ctx_.get()); + patterns.add(ctx_.get()); + EXPECT_TRUE(succeeded(applyPatternsGreedily(test_func, std::move(patterns)))); + + auto depthwise_conv_op = + FindOperationOfType(test_func); + EXPECT_THAT(depthwise_conv_op, NotNull()); + // The filter of the DepthwiseConv2dNativeOp is expected to be a constant. + EXPECT_TRUE(isa(depthwise_conv_op.getFilter().getDefiningOp())); +} + +TEST_F(ConstantFoldingTest, DepthwiseConvWeightNotFoldable) { + constexpr absl::string_view kModuleCode = R"mlir( + module { + func.func @test_fold_constant(%arg0: tensor<*xf32>, %arg1: tensor) -> (tensor) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() {value = dense<0.500000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %w = "tf.Mul"(%cst, %arg1) : (tensor<2x3x3x1xf32>, tensor) -> tensor<2x3x3x1xf32> + %0 = "tf.DepthwiseConv2dNative"(%arg0, %w) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC"} : (tensor, tensor<3xf32>) -> tensor + %2 = "tf.Mul"(%1, %cst_1) : (tensor, tensor<3xf32>) -> tensor + func.return %2 : tensor + } + } + )mlir"; + + OwningOpRef module_op_ref = ParseModuleOpString(kModuleCode); + const auto test_func = + module_op_ref->lookupSymbol("test_fold_constant"); + ASSERT_THAT(test_func, NotNull()); + + RewritePatternSet patterns(ctx_.get()); + patterns.add(ctx_.get()); + EXPECT_TRUE(succeeded(applyPatternsGreedily(test_func, std::move(patterns)))); + + auto depthwise_conv_op = + FindOperationOfType(test_func); + EXPECT_THAT(depthwise_conv_op, NotNull()); + // The filter of the DepthwiseConv2dNativeOp is not constant-foldable. 
+ EXPECT_TRUE(isa(depthwise_conv_op.getFilter().getDefiningOp())); +} + +} // namespace +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD index de23418e1af0..b41356e67e0f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD @@ -37,6 +37,56 @@ tf_cc_test( ], ) +cc_library( + name = "tf_tf_quantize_op", + srcs = [ + "tf_tf_quantize_op.cc", + ], + hdrs = ["tf_tf_quantize_op.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/utils:tf_quantize_op_utils", + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + ], +) + +cc_library( + name = "temp_tf_op_quant_spec", + srcs = [ + "temp_tf_op_quant_spec.cc", + ], + hdrs = ["temp_tf_op_quant_spec.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/container:flat_hash_set", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + +tf_cc_test( + name = "temp_tf_op_quant_spec_test", + srcs = ["temp_tf_op_quant_spec_test.cc"], + deps = [ + ":temp_tf_op_quant_spec", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "tf_quantize_op", srcs = [ @@ -76,6 +126,21 @@ tf_cc_test( ], ) +cc_library( + name = "tf_uniform_op_quant_spec", + srcs = [ + "tf_uniform_op_quant_spec.cc", + ], + hdrs = ["tf_uniform_op_quant_spec.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib", + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "uniform_op_quant_spec", srcs = [ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.cc new file mode 100644 index 000000000000..dd13cdb0fd7f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.cc @@ -0,0 +1,168 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h" + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace tf_quant { + +// TODO - b/296503614: [Converter Component][TF-Quantizer] Reflect custom traits +// from TF-Quantizer to stableHLO quantization +bool IsOpWithDataMovementTrait(Operation* op) { + // Supported data movement ops. These ops do not perform any computations and + // has one result operand. + return isa(op); +} + +bool IsOpWithQuantizableTrait(Operation* op) { + // Supported quantizable ops. + return isa(op); +} + +bool IsOpWithInt8TypeOperand(Operation* op) { + return (isa(op)); +} + +bool IsValueWithQuantizablePrecision(Value val) { + auto type = mlir::dyn_cast(val.getType()); + if (!type) return false; + // Supported original tensor data types. + if (type.getElementType().isF32() || type.getElementType().isBF16()) + return true; + return false; +} + +std::optional +GetWeightComponentSpec( + const tensorflow::quantization::QuantizationOptions& quantization_options) { + for (auto& cur_spec : quantization_options.quantization_method() + .quantization_component_specs()) { + if (cur_spec.quantization_component() == + tensorflow::quantization::QuantizationComponentSpec::COMPONENT_WEIGHT) + return cur_spec; + } + return std::nullopt; +} + +// TODO(b/228928859): Improve the getter function to match attributes rather +// than function name. 
+std::unique_ptr GetTFOpQuantSpec(Operation* op) { + auto spec = std::make_unique(); + if (auto call_op = dyn_cast(op)) { + StringRef function_name = + mlir::cast(call_op.getFAttr()).getValue(); + if (!function_name.starts_with("composite_")) { + return spec; + } + if (function_name.contains("depthwise_conv2d")) { + spec->coeff_op_quant_dim[1] = 3; + if (function_name.contains("with_bias")) { + spec->biases_params[2] = {{0, 1}, + tf_quant::GetUniformQuantizedTypeForBias}; + } + } else if (function_name.contains("conv2d")) { + spec->coeff_op_quant_dim[1] = 3; + if (function_name.contains("with_bias")) { + spec->biases_params[2] = {{0, 1}, + tf_quant::GetUniformQuantizedTypeForBias}; + } + } else if (function_name.contains("matmul")) { + spec->coeff_op_quant_dim[1] = -1; + if (function_name.contains("with_bias") || + function_name.contains("and_bias")) { + spec->biases_params[2] = {{0, 1}, + tf_quant::GetUniformQuantizedTypeForBias}; + } + } else if (function_name.contains("einsum")) { + spec->coeff_op_quant_dim[1] = -1; + if (function_name.contains("with_bias")) { + spec->biases_params[2] = {{0, 1}, + tf_quant::GetUniformQuantizedTypeForBias}; + } + } else if (function_name.contains("conv3d")) { + spec->coeff_op_quant_dim[1] = 4; + if (function_name.contains("with_bias")) { + spec->biases_params[2] = {{0, 1}, + tf_quant::GetUniformQuantizedTypeForBias}; + } + } else if (function_name.contains("batch_matmul")) { + spec->coeff_op_quant_dim[1] = -1; + if (function_name.contains("with_bias")) { + spec->biases_params[2] = {{0, 1}, + tf_quant::GetUniformQuantizedTypeForBias}; + } + } else if (function_name.contains("gather")) { + // Note that gather has axis attribute that specifies channel axis. + spec->coeff_op_quant_dim[0] = -1; + } + for (auto quantizable_operand : spec->coeff_op_quant_dim) { + spec->quantizable_operands.insert(quantizable_operand.first); + } + } + return spec; +} + +std::unique_ptr GetTfQuantScaleSpec(Operation* op) { + auto scale_spec = std::make_unique(); + if (llvm::isa< + // clang-format off + // go/keep-sorted start + TF::AvgPoolOp, + TF::ConcatOp, + TF::ConcatV2Op, + TF::ExpandDimsOp, + TF::IdentityNOp, + TF::IdentityOp, + TF::MaxPoolOp, + TF::PadV2Op, + TF::RankOp, + TF::ReshapeOp, + TF::SelectOp, + TF::SelectV2Op, + TF::ShapeNOp, + TF::ShapeOp, + TF::SizeOp, + TF::SqueezeOp, + TF::TransposeOp + // go/keep-sorted end + // clang-format on + >(op)) { + scale_spec->has_same_scale_requirement = true; + } + return scale_spec; +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h b/tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h new file mode 100644 index 000000000000..ba89e21ff08f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h @@ -0,0 +1,61 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+// Functions for quantization specifications of TensorFlow ops.
+
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TEMP_TF_OP_QUANT_SPEC_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TEMP_TF_OP_QUANT_SPEC_H_
+
+#include <memory>
+#include <optional>
+
+#include "mlir/IR/Operation.h"  // from @llvm-project
+#include "mlir/IR/Value.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
+
+namespace mlir {
+namespace tf_quant {
+
+// Check if the op has the data movement trait. Ops with this trait do not
+// perform any computations but just move data and have one result operand.
+bool IsOpWithDataMovementTrait(Operation* op);
+
+// Check if the op is quantizable. Currently, the scope of quantizable ops is
+// limited to compute-intensive operations and ops that support integer
+// operands.
+bool IsOpWithQuantizableTrait(Operation* op);
+
+// Check if the op's operand accepts an int8 type.
+bool IsOpWithInt8TypeOperand(Operation* op);
+
+// Check if the data is in a quantizable precision. Currently, a value in f32
+// or bf16 is quantizable.
+bool IsValueWithQuantizablePrecision(Value val);
+
+std::optional
+GetWeightComponentSpec(
+    const tensorflow::quantization::QuantizationOptions& quantization_options);
+
+// Returns the spec for the given operation that can be used for both dynamic
+// and static range quantization.
+std::unique_ptr GetTFOpQuantSpec(Operation* op);
+
+// Returns quantization scale specs (fixed output, same scale) for a TF op.
+std::unique_ptr GetTfQuantScaleSpec(Operation* op);
+
+}  // namespace tf_quant
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TEMP_TF_OP_QUANT_SPEC_H_
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec_test.cc
new file mode 100644
index 000000000000..9ee83d63a7a9
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec_test.cc
@@ -0,0 +1,47 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h" + +#include +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir::tf_quant { +namespace { + +using QuantizationOptions = tensorflow::quantization::QuantizationOptions; +using QuantizationComponentSpec = + tensorflow::quantization::QuantizationComponentSpec; + +TEST(TfOpQuantSpecTest, WeightComponentSpecExist) { + QuantizationOptions quant_options; + QuantizationComponentSpec quant_spec; + quant_spec.set_quantization_component( + QuantizationComponentSpec::COMPONENT_WEIGHT); + quant_spec.set_tensor_type(QuantizationComponentSpec::TENSORTYPE_INT_8); + auto mutable_quant_method = quant_options.mutable_quantization_method(); + *mutable_quant_method->add_quantization_component_specs() = quant_spec; + auto output = GetWeightComponentSpec(quant_options); + EXPECT_TRUE(output.has_value()); +} + +TEST(TfOpQuantSpecTest, WeightComponentSpecDoNotExist) { + QuantizationOptions quant_options; + auto output = GetWeightComponentSpec(quant_options); + EXPECT_FALSE(output.has_value()); +} + +} // namespace +} // namespace mlir::tf_quant diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc index 9630b20b32d5..86bf1677b06d 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc @@ -89,39 +89,33 @@ std::unique_ptr GetTFOpQuantSpec(Operation* op) { if (function_name.contains("depthwise_conv2d")) { spec->coeff_op_quant_dim[1] = 3; if (function_name.contains("with_bias")) { - spec->biases_params[2] = {{0, 1}, - quant::GetUniformQuantizedTypeForBias}; + spec->biases_params[2] = {{0, 1}, GetUniformQuantizedTypeForBias}; } } else if (function_name.contains("conv2d")) { spec->coeff_op_quant_dim[1] = 3; if (function_name.contains("with_bias")) { - spec->biases_params[2] = {{0, 1}, - quant::GetUniformQuantizedTypeForBias}; + spec->biases_params[2] = {{0, 1}, GetUniformQuantizedTypeForBias}; } } else if (function_name.contains("matmul")) { spec->coeff_op_quant_dim[1] = -1; if (function_name.contains("with_bias") || function_name.contains("and_bias")) { - spec->biases_params[2] = {{0, 1}, - quant::GetUniformQuantizedTypeForBias}; + spec->biases_params[2] = {{0, 1}, GetUniformQuantizedTypeForBias}; } } else if (function_name.contains("einsum")) { spec->coeff_op_quant_dim[1] = -1; if (function_name.contains("with_bias")) { - spec->biases_params[2] = {{0, 1}, - quant::GetUniformQuantizedTypeForBias}; + spec->biases_params[2] = {{0, 1}, GetUniformQuantizedTypeForBias}; } } else if (function_name.contains("conv3d")) { spec->coeff_op_quant_dim[1] = 4; if (function_name.contains("with_bias")) { - spec->biases_params[2] = {{0, 1}, - quant::GetUniformQuantizedTypeForBias}; + spec->biases_params[2] = {{0, 1}, GetUniformQuantizedTypeForBias}; } } else if (function_name.contains("batch_matmul")) { spec->coeff_op_quant_dim[1] = -1; if (function_name.contains("with_bias")) { - spec->biases_params[2] = {{0, 1}, - quant::GetUniformQuantizedTypeForBias}; + spec->biases_params[2] = {{0, 1}, GetUniformQuantizedTypeForBias}; } } else if (function_name.contains("gather")) { // Note that gather has axis attribute that specifies channel axis. 
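For orientation, the snippet below sketches how the spec getters introduced in temp_tf_op_quant_spec.h might be consumed by a pass. It is not part of the patch: the InspectQuantSpec helper and the logging are hypothetical, and the sketch only touches members that appear in this diff (coeff_op_quant_dim, has_same_scale_requirement, and the optional returned by GetWeightComponentSpec).

#include "llvm/Support/raw_ostream.h"
#include "mlir/IR/Operation.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h"
#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"

// Hypothetical helper: reports which operands of `op` would be quantized and
// whether the op has to share scales with its neighbors.
void InspectQuantSpec(
    mlir::Operation* op,
    const tensorflow::quantization::QuantizationOptions& options) {
  // Per-operand spec: maps a quantizable operand index to its quantization
  // dimension (-1 means per-tensor quantization).
  auto spec = mlir::tf_quant::GetTFOpQuantSpec(op);
  for (const auto& quantizable_operand : spec->coeff_op_quant_dim) {
    llvm::outs() << "operand #" << quantizable_operand.first
                 << " quantized along dim " << quantizable_operand.second
                 << "\n";
  }

  // Scale spec: ops such as tf.Reshape or tf.Transpose must reuse the
  // quantization scale of their inputs.
  if (mlir::tf_quant::GetTfQuantScaleSpec(op)->has_same_scale_requirement) {
    llvm::outs() << "op requires same-scale quantization\n";
  }

  // Weight component spec is present only when the quantization options ask
  // for weight quantization.
  if (mlir::tf_quant::GetWeightComponentSpec(options).has_value()) {
    llvm::outs() << "weight component spec is present\n";
  }
}

As the TODO in temp_tf_op_quant_spec.cc notes, GetTFOpQuantSpec currently keys off the composite function name of a PartitionedCallOp, while GetTfQuantScaleSpec keys off the op type, so the two specs are looked up independently.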
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.cc index 6aacfeac0fdd..4394045469cc 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.cc @@ -19,7 +19,6 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" -#include "absl/types/optional.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project @@ -154,8 +153,8 @@ QuantizedType CalculateUniformQuantParams( DenseFPElementsAttr attr; if (!matchPattern(op->getResult(0), m_Constant(&attr))) return nullptr; - QuantizedType quant_type = mlir::dyn_cast( - quant::GetUniformQuantizedTypeForWeight( + QuantizedType quant_type = + mlir::dyn_cast(GetUniformQuantizedTypeForWeight( attr, /*symmetric=*/kIsNarrowRange && kIsSigned, kBitWidth, kIsSigned, kIsNarrowRange, /*is_legacy_float*/ false)); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_tf_quantize_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_tf_quantize_op.cc new file mode 100644 index 000000000000..d049fe15a084 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_tf_quantize_op.cc @@ -0,0 +1,261 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_tf_quantize_op.h"
+
+#include
+#include
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/Dialect/Quant/IR/QuantTypes.h"  // from @llvm-project
+#include "mlir/IR/Block.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypeInterfaces.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/IR/Diagnostics.h"  // from @llvm-project
+#include "mlir/IR/Matchers.h"  // from @llvm-project
+#include "mlir/IR/Operation.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/IR/SymbolTable.h"  // from @llvm-project
+#include "mlir/IR/Types.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_quantize_op_utils.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+
+namespace mlir {
+namespace tf_quant {
+namespace {
+constexpr StringRef kDequantizeFunctionName = "composite_dequantize";
+constexpr StringRef kUniformQuantizationFunctionName = "uniform";
+
+// Pre-actions before adding the quantization logic. Creates a function named
+// `func_name` that takes `input_val` as its input and whose result type is
+// `result_type`.
+func::FuncOp PrepareFunctionRegister(PatternRewriter& rewriter, Value input_val,
+                                     ShapedType result_type,
+                                     StringRef func_name,
+                                     Value& func_input_arg) {
+  Operation* input_op = input_val.getDefiningOp();
+
+  Operation* insertion_point = input_op->getParentOfType();
+  if (!insertion_point) insertion_point = input_op->getParentOfType();
+  rewriter.setInsertionPointAfter(insertion_point);
+
+  UnrankedTensorType create_unknown_input_shape =
+      quant::CreateUnknownShapeFromElementType(input_val.getType());
+  UnrankedTensorType create_unknown_output_shape =
+      quant::CreateUnknownShapeFromElementType(result_type);
+
+  FunctionType func_type =
+      FunctionType::get(rewriter.getContext(), {create_unknown_input_shape},
+                        {create_unknown_output_shape});
+
+  func::FuncOp quantization_func =
+      rewriter.create(input_op->getLoc(), func_name, func_type);
+
+  OpBuilder::InsertionGuard guard = OpBuilder::InsertionGuard(rewriter);
+  ArrayRef inputs = quantization_func.getFunctionType().getInputs();
+  Block* block = rewriter.createBlock(
+      &quantization_func.getBody(), quantization_func.begin(), inputs,
+      SmallVector(inputs.size(), quantization_func.getLoc()));
+  func_input_arg = block->getArgument(0);
+  return quantization_func;
+}
+
+// Post-actions after adding the quantization logic. Post-actions include:
+// 1) Adding the created function to the symbol table.
+// 2) Creating a PartitionedCallOp in the main graph that calls the created
+//    function.
+TF::PartitionedCallOp FinalizeFunctionRegister(
+    PatternRewriter& rewriter, Value input, Value output,
+    func::FuncOp& quantization_func, Operation* quantized_op,
+    StringRef func_name, IRRewriter::InsertPoint original_point,
+    Type quantize_result_type) {
+  rewriter.create(input.getLoc(), ArrayRef({output}));
+
+  quantization_func.setVisibility(func::FuncOp::Visibility::Private);
+  SymbolTable symbol_table(quantized_op->getParentOfType());
+
+  symbol_table.insert(quantization_func);
+
+  FlatSymbolRefAttr func_name_attr =
+      FlatSymbolRefAttr::get(rewriter.getStringAttr(func_name));
+
+  rewriter.restoreInsertionPoint(original_point);
+
+  auto quantize_call = rewriter.create(
+      quantized_op->getLoc(), quantize_result_type, input,
+      /*args_attrs=*/nullptr, /*res_attrs=*/nullptr, func_name_attr,
+      /*config=*/"", /*config_proto=*/"", /*executor_type=*/"");
+  return quantize_call;
+}
+
+// Registers a function whose body contains the sequence of operations required
+// to execute the quantization or dequantization logic of a given quantization
+// scheme.
+std::optional RegisterOperationsInFuncOp(
+    StringRef func_name, PatternRewriter& rewriter, QuantizedType quant_type,
+    Value input_val, ShapedType result_type,
+    std::function
+        quantization_operations_func) {
+  Operation* input_op = input_val.getDefiningOp();
+  auto original_point = rewriter.saveInsertionPoint();
+
+  auto unique_func_name = func_name.str();
+  SymbolTable symbol_table(input_op->getParentOfType());
+  while (symbol_table.lookup(unique_func_name)) {
+    absl::StrAppend(&unique_func_name, "_");
+  }
+
+  Value func_input_arg;
+  // Creates a function.
+  func::FuncOp func_op = PrepareFunctionRegister(
+      rewriter, input_val, result_type, unique_func_name, func_input_arg);
+
+  // Fills the body.
+  Operation* last_op_in_func =
+      quantization_operations_func(rewriter, func_op.getOperation(),
+                                   func_input_arg, result_type, quant_type);
+
+  // Connects the function to the existing graph.
+  auto end_call_op = FinalizeFunctionRegister(
+      rewriter, input_val, last_op_in_func->getResult(0), func_op, input_op,
+      unique_func_name, original_point, result_type);
+  return end_call_op;
+}
+
+QuantizedType CalculateUniformQuantParams(
+    PatternRewriter& rewriter, TF::ConstOp op,
+    tensorflow::quantization::QuantizationComponentSpec& weight_spec) {
+  // TODO - b/278949920: Enable Per-Channel Quantization for XLA Opset
+  // Currently, only symmetric, per-tensor, signed int8 is supported.
+  const bool kIsNarrowRange = true;
+  const bool kIsSigned = true;
+  const int kBitWidth = 8;
+
+  DenseFPElementsAttr attr;
+  if (!matchPattern(op->getResult(0), m_Constant(&attr))) return nullptr;
+
+  QuantizedType quant_type =
+      mlir::dyn_cast(GetUniformQuantizedTypeForWeight(
+          attr, /*symmetric=*/kIsNarrowRange && kIsSigned, kBitWidth, kIsSigned,
+          kIsNarrowRange, /*is_legacy_float*/ false));
+
+  return quant_type;
+}
+
+// Add uniform quantization's quantization logic.
+std::optional AddUniformQuantizeOps(PatternRewriter& rewriter, + TF::ConstOp op, + QuantizedType quant_type) { + DenseFPElementsAttr attr; + if (!matchPattern(op->getResult(0), m_Constant(&attr))) { + return nullptr; + } + Type expressed_type = op.getResult().getType(); + Type quantized_type = quant_type.castFromExpressedType(expressed_type); + ShapedType shaped_quantized_type = mlir::cast(quantized_type); + DenseElementsAttr tensor_proto_attr = + mlir::dyn_cast(Quantize(attr, shaped_quantized_type)); + if (!tensor_proto_attr) { + return nullptr; + } + + Type storage_type = + mlir::cast(shaped_quantized_type.getElementType()) + .getStorageType(); + ShapedType new_type = shaped_quantized_type.clone(storage_type); + + rewriter.setInsertionPointAfter(op); + auto const_op = + rewriter.create(op.getLoc(), new_type, tensor_proto_attr); + auto new_identity_op = rewriter.create( + op->getLoc(), const_op.getType(), const_op); + return new_identity_op.getResult(); +} + +Operation* LogicsForUniformDequanization(PatternRewriter& rewriter, + Operation* func_op, Value input_val, + ShapedType original_input_tensor_type, + QuantizedType quant_type) { + auto loc = input_val.getLoc(); + rewriter.setInsertionPointToStart( + &(cast(func_op)).getBody().front()); + + UnrankedTensorType create_unknown_input_shape = + quant::CreateUnknownShapeFromElementType(original_input_tensor_type); + auto new_cast_op = + rewriter.create(loc, create_unknown_input_shape, input_val); + // TODO - b/278949920: Enable Per-Channel Quantization for XLA Opset + auto qtype = mlir::dyn_cast(quant_type); + TensorType scale_type = RankedTensorType::get({}, rewriter.getF32Type()); + Value scale_op = rewriter.create( + loc, scale_type, + DenseFPElementsAttr::get(scale_type, + {static_cast(qtype.getScale())})); + + if (original_input_tensor_type.getElementType().isBF16()) { + // Add bf16 cast op after scale to match with the next op's data + // type. + scale_op = rewriter.create( + loc, UnrankedTensorType::get(rewriter.getBF16Type()), scale_op); + } + + auto mul_op = rewriter.create(loc, new_cast_op.getType(), scale_op, + new_cast_op); + return mul_op; +} + +// Add uniform quantization's dequantization logic. +std::optional AddUniformDequantizeOps( + PatternRewriter& rewriter, QuantizedType quant_type, + Value val_to_dequantize, ShapedType result_type) { + auto func_name = absl::StrJoin( + {kDequantizeFunctionName, kUniformQuantizationFunctionName}, "_"); + + std::optional dequant_op = RegisterOperationsInFuncOp( + func_name, rewriter, quant_type, val_to_dequantize, result_type, + LogicsForUniformDequanization); + + return dequant_op; +} +} // namespace + +// Generate quantize and dequantize functions with uniform quantization. 
+std::optional ApplyUniformQuantization(
+    PatternRewriter& rewriter, TF::ConstOp op,
+    tensorflow::quantization::QuantizationComponentSpec& weight_spec) {
+  QuantizedType quant_type =
+      CalculateUniformQuantParams(rewriter, op, weight_spec);
+  if (!quant_type) return nullptr;
+
+  std::optional quantized_val =
+      AddUniformQuantizeOps(rewriter, op, quant_type);
+  if (!quantized_val.has_value()) return std::nullopt;
+
+  std::optional dequantized_val =
+      AddUniformDequantizeOps(rewriter, quant_type, quantized_val.value(),
+                              mlir::cast(op.getType()));
+
+  return dequantized_val;
+}
+
+}  // namespace tf_quant
+}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_tf_quantize_op.h b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_tf_quantize_op.h
new file mode 100644
index 000000000000..6f7deda4f320
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_tf_quantize_op.h
@@ -0,0 +1,45 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file provides a list of supported quantization algorithms in the format
+// of "applyQuantization".
+// After applying the function, quantize and dequantize functions are created,
+// where the body of each function contains a specific quantization algorithm.
+// The quantize function takes one operand that satisfies
+// IsValueWithQuantizablePrecision and produces a tensor with a supported
+// quantized precision (such as int8). For the dequantize function, it is the
+// other way around.
+
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TF_TF_QUANTIZE_OP_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TF_TF_QUANTIZE_OP_H_
+
+#include <optional>
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/Dialect/Traits.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+
+namespace mlir {
+namespace tf_quant {
+
+std::optional ApplyUniformQuantization(
+    PatternRewriter& rewriter, TF::ConstOp op,
+    tensorflow::quantization::QuantizationComponentSpec& weight_spec);
+
+}  // namespace tf_quant
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TF_TF_QUANTIZE_OP_H_
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_uniform_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_uniform_op_quant_spec.cc
new file mode 100644
index 000000000000..f7e1c01b759b
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_uniform_op_quant_spec.cc
@@ -0,0 +1,41 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_uniform_op_quant_spec.h" + +#include + +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir::tf_quant { + +std::unique_ptr GetUniformOpQuantSpec(Operation* op) { + auto spec = std::make_unique(); + if (isa(op) || + isa(op)) { + spec->coeff_op_quant_dim[1] = 3; + } else if (isa(op)) { + spec->coeff_op_quant_dim[1] = -1; + } + + for (auto quantizable_operand : spec->coeff_op_quant_dim) { + spec->quantizable_operands.insert(quantizable_operand.first); + } + return spec; +} + +} // namespace mlir::tf_quant diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_uniform_op_quant_spec.h b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_uniform_op_quant_spec.h new file mode 100644 index 000000000000..23da455519f0 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_uniform_op_quant_spec.h @@ -0,0 +1,35 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Functions for quantization specifications of Uniform Quantized ops. + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TF_UNIFORM_OP_QUANT_SPEC_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TF_UNIFORM_OP_QUANT_SPEC_H_ + +#include + +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" + +namespace mlir { +namespace tf_quant { + +// Returns the spec for the given operation that can be used for both of +// dynamic and static range quantization. 
+std::unique_ptr GetUniformOpQuantSpec(Operation* op); + +} // namespace tf_quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_OPS_TF_UNIFORM_OP_QUANT_SPEC_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc index b59eaf759174..0b73b9c550b6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc @@ -174,6 +174,15 @@ class AddDumpTensorOp : public OpRewritePattern { debugger_type_(debugger_type), log_dir_path_(std::move(log_dir_path)) {} + LogicalResult matchAndRewrite(LiftedOpT op, + PatternRewriter &rewriter) const override { + if (match(op).failed()) { + return failure(); + } + rewrite(op, rewriter); + return success(); + } + private: SmallVector CreateDumpAttributes( PatternRewriter &rewriter, const StringRef folder_name, @@ -203,7 +212,7 @@ class AddDumpTensorOp : public OpRewritePattern { return symbol_table.insert(new_ref_func); } - LogicalResult match(LiftedOpT op) const override { + LogicalResult match(LiftedOpT op) const { if (!op->hasAttr(kQuantTraitAttrName) || op->getNumResults() != 1) { return failure(); } @@ -218,7 +227,7 @@ class AddDumpTensorOp : public OpRewritePattern { return success(); } - void rewrite(LiftedOpT op, PatternRewriter &rewriter) const override { + void rewrite(LiftedOpT op, PatternRewriter &rewriter) const { // Only support ops with 1 results Value result = op->getResult(0); rewriter.setInsertionPointAfterValue(result); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_quantization_unit_loc.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_quantization_unit_loc.cc index 7c0afb0b683b..16782cc292aa 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_quantization_unit_loc.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_quantization_unit_loc.cc @@ -29,7 +29,6 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc index 4d1ae4ea5397..50d4030083d9 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc @@ -52,8 +52,17 @@ class CastBf16OpsToF32 : public RewritePattern { explicit CastBf16OpsToF32(MLIRContext* context) : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {} + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { + if (match(op).failed()) { + return failure(); + } + rewrite(op, rewriter); + return success(); + } + private: - LogicalResult match(Operation* op) const override { + LogicalResult match(Operation* op) const { if (isa(op) || op->getName().hasTrait()) { return failure(); @@ -71,7 +80,7 @@ class CastBf16OpsToF32 : public RewritePattern { return failure(); } - void rewrite(Operation* op, PatternRewriter& rewriter) const override { + void rewrite(Operation* op, PatternRewriter& rewriter) const { // Casts inputs of the operation. for (int i = 0; i < op->getNumOperands(); i++) { Value input = op->getOperand(i); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.cc index 7c5590da9ed2..92b759b73a0e 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.cc @@ -21,7 +21,6 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/remove_identity_op_pattern.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc index 886f9cd28a12..ec7ffefd2d43 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc @@ -86,7 +86,7 @@ std::optional GetCompsiteFunctionName(Operation *op) { return entry_function_attr.getValue(); } else { TF::PartitionedCallOp call_op = dyn_cast_or_null(op); - const auto f_attr = call_op.getFAttr().dyn_cast(); + const auto f_attr = mlir::dyn_cast(call_op.getFAttr()); if (!f_attr) return std::nullopt; return f_attr.getValue(); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_restore_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_restore_op.cc index 30bae562a4a6..3eb553702717 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_restore_op.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_restore_op.cc @@ -114,8 +114,8 @@ BlockArgument InsertFilePrefixArgument(func::FuncOp func_op, const int insert_idx = func_op.getNumArguments(); - func_op.insertArgument(insert_idx, /*argType=*/filename_op_type, arg_attrs, - NameLoc::get(file_prefix_attr)); + (void)func_op.insertArgument(insert_idx, /*argType=*/filename_op_type, + arg_attrs, NameLoc::get(file_prefix_attr)); return func_op.getArgument(insert_idx); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.td index d56ee05dc071..9e0f26d87936 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.td @@ -25,8 +25,8 @@ include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td" //===----------------------------------------------------------------------===// class IsFusedOpEndsWith : AttrConstraint< - CPred<"!$_self.cast().empty() && " - "$_self.cast()[$_self.cast().size() - 1]." + CPred<"!llvm::cast($_self).empty() && " + "llvm::cast($_self)[llvm::cast($_self).size() - 1]." 
"cast<::mlir::StringAttr>().str() == \"" # OpName # "\"">, "Matching fused '" # OpName # "' op at the end">; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc index fe196b9caa44..927905c5a6e4 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc @@ -211,8 +211,8 @@ IRMapping CloneSrcFuncArgumentsToMainFunc(func::FuncOp src_func_op, const DictionaryAttr main_arg_attr = src_func_op.getArgAttrDict(src_arg_idx); - main_func_op.insertArgument(main_arg_idx, src_arg.getType(), main_arg_attr, - src_arg.getLoc()); + (void)main_func_op.insertArgument(main_arg_idx, src_arg.getType(), + main_arg_attr, src_arg.getLoc()); const std::string new_input_name = absl::StrCat(GetInitializerType(src_func_op), "_", src_arg_idx, ":0"); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/post_quantize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/post_quantize.cc index 307e97bd8527..9e73b72d7de5 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/post_quantize.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/post_quantize.cc @@ -24,7 +24,6 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td index d75a01be7d21..338fdc91fc52 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td @@ -83,21 +83,21 @@ class HasEqualElementSize shape_1, list shape_2> : Constraint< "Checks if the given dimensions contain the same number of elements.">; def ReshapableTo1DTensor : Constraint< - CPred<"quant::ReshapableTo1DTensor($0.getType().cast())">, + CPred<"quant::ReshapableTo1DTensor(llvm::cast($0.getType()))">, "Checks if the value dims are all ones except the right most dim">; def ReshapeTo1DTensor : NativeCodeCall< "quant::ReshapeTo1DTensor($_builder, $_loc, $0)">; def HasEqualShape : Constraint().hasRank() && " - "$1.getType().cast().hasRank() && " - "$0.getType().cast().getShape() == $1.getType().cast().getShape()">, + "llvm::cast($0.getType()).hasRank() && " + "llvm::cast($1.getType()).hasRank() && " + "llvm::cast($0.getType()).getShape() == llvm::cast($1.getType()).getShape()">, "Checks if the shapes of tensors are same.">; // Make the 1D value $0 broadcastable with the shape of $1. def MakeOneDimValueBroadcastable : NativeCodeCall< - "MakeOneDimValueBroadcastable($_builder, $_loc, $0, $1.getType().cast())">; + "MakeOneDimValueBroadcastable($_builder, $_loc, $0, llvm::cast($1.getType()))">; // Match convolution op with "NHWC" data format or matmul op. 
def SupportedAffineOpMatcher : NativeCodeCall< diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc index 091f08177dc4..f577ce38bd3c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc @@ -36,7 +36,6 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" -#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/propagate_quantize_type.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/propagate_quantize_type.cc index 2f4cd3e815a0..508771e94475 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/propagate_quantize_type.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/propagate_quantize_type.cc @@ -31,7 +31,6 @@ limitations under the License. #include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc index c18d76327ca8..1bb95d4e865f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc @@ -40,7 +40,6 @@ limitations under the License. #include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" -#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" @@ -85,7 +84,7 @@ struct TFQuantizationBase Operation* quantized_op, const CustomMap& custom_op_map) { auto call_op = cast(quantized_op); StringRef function_name = - call_op.getFAttr().cast().getValue(); + llvm::cast(call_op.getFAttr()).getValue(); // The below can be generalized as there are more read-only ops added such // as slice. 
const bool is_gather = function_name.contains("gather"); @@ -98,7 +97,7 @@ struct TFQuantizationBase const CustomMap& custom_op_map) { auto call_op = cast(quantized_op); StringRef function_name = - call_op.getFAttr().cast().getValue(); + llvm::cast(call_op.getFAttr()).getValue(); // The below can be generalized as there are more read-only ops added such // as slice. bool is_gather = false; @@ -221,16 +220,16 @@ class QuantizeSameScaleOpsPattern inputs.reserve(quantizing_op->getNumOperands()); for (const auto& operand : quantizing_op->getOperands()) { Type operand_type = operand.getType(); - if (operand_type.isa()) { + if (isa(operand_type)) { inputs.push_back(operand); continue; } - Type elem_type = operand_type.cast().getElementType(); + Type elem_type = llvm::cast(operand_type).getElementType(); if (auto dq_op = dyn_cast_or_null( operand.getDefiningOp())) { - auto dq_arg_type = dq_op.getArg().getType().cast(); - auto qtype = dq_arg_type.getElementType().cast(); + auto dq_arg_type = llvm::cast(dq_op.getArg().getType()); + auto qtype = llvm::cast(dq_arg_type.getElementType()); auto scast_op = rewriter.create( dq_op->getLoc(), dq_arg_type.clone(qtype.getStorageType()), dq_op.getArg()); @@ -253,12 +252,12 @@ class QuantizeSameScaleOpsPattern llvm::enumerate(quantizing_op->getResults())) { Value result = enumerated_result.value(); Type result_type = result.getType(); - if (result_type.isa()) { + if (isa(result_type)) { outputs_replaced.insert({result, enumerated_result.index()}); output_types.push_back(result_type); continue; } - auto result_tensor_type = result_type.cast(); + auto result_tensor_type = llvm::cast(result_type); // If the user is the Quantize op, it must be the only user. if (result.hasOneUse() && llvm::isa(*result.user_begin())) { @@ -266,10 +265,8 @@ class QuantizeSameScaleOpsPattern llvm::cast(*result.user_begin()); outputs_replaced.insert( {user.getResult(), enumerated_result.index()}); - auto qtype = user.getType() - .cast() - .getElementType() - .cast(); + auto qtype = llvm::cast( + llvm::cast(user.getType()).getElementType()); output_types.push_back( result_tensor_type.clone(qtype.getStorageType())); } else if (!result_tensor_type.getElementType().isF32()) { @@ -338,7 +335,7 @@ class QuantizeSameScaleOpsPattern // Check if the preceding op is a quantized same-scale op. if (llvm::isa(preceding_op)) { auto sc_op = llvm::cast(preceding_op); - auto sc_arg_type = sc_op.getArg().getType().dyn_cast(); + auto sc_arg_type = llvm::dyn_cast(sc_op.getArg().getType()); if (sc_arg_type.getElementType().isInteger(8)) { return true; } @@ -364,7 +361,8 @@ class QuantizeSameScaleOpsPattern // Check if the preceding op is a quantized same-scale op. 
if (llvm::isa(following_op)) { auto sc_op = llvm::cast(following_op); - auto sc_arg_type = sc_op.getResult().getType().dyn_cast(); + auto sc_arg_type = + llvm::dyn_cast(sc_op.getResult().getType()); if (sc_arg_type.getElementType().isInteger(8)) { return true; } @@ -381,28 +379,28 @@ class QuantizeSameScaleOpsPattern return false; } - const auto f_attr = call_op.getFAttr().dyn_cast(); + const auto f_attr = llvm::dyn_cast(call_op.getFAttr()); if (!f_attr || !f_attr.getValue().starts_with("composite_")) { return false; } bool has_quantized_types = false; for (Value input : call_op.getArgs()) { - if (auto type = input.getType().dyn_cast()) { - if (type.getElementType().isa()) { + if (auto type = llvm::dyn_cast(input.getType())) { + if (isa(type.getElementType())) { return false; } - if (type.getElementType().isa()) { + if (isa(type.getElementType())) { has_quantized_types = true; } } } for (Value output : call_op.getOutput()) { - if (auto type = output.getType().dyn_cast()) { - if (type.getElementType().isa()) { + if (auto type = llvm::dyn_cast(output.getType())) { + if (isa(type.getElementType())) { return false; } - if (type.getElementType().isa()) { + if (isa(type.getElementType())) { has_quantized_types = true; } } @@ -432,10 +430,11 @@ struct QuantizeAvgPoolOpPattern if (!preceding_sc_op) return failure(); // Check if the same-scale requirement is met. - auto dq_arg_type = preceding_sc_op.getArg().getType().cast(); - auto qtype = dq_arg_type.getElementType().cast(); - auto q_result_type = sc_op.getType().cast(); - auto out_qtype = q_result_type.getElementType().cast(); + auto dq_arg_type = + llvm::cast(preceding_sc_op.getArg().getType()); + auto qtype = llvm::cast(dq_arg_type.getElementType()); + auto q_result_type = llvm::cast(sc_op.getType()); + auto out_qtype = llvm::cast(q_result_type.getElementType()); if (qtype != out_qtype) { avg_pool_op.emitError( "The preceding StorageCastOp and the following " diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc index a0176b1b5264..e5563d09cb7c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc @@ -45,7 +45,6 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" -#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/remove_var_init_by_const.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/remove_var_init_by_const.cc index 4ea643cb307e..ae3a25b32199 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/remove_var_init_by_const.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/remove_var_init_by_const.cc @@ -64,26 +64,23 @@ class RemoveVariableInitializationByConstPass struct RemoveVariableAssignmentByConst : public OpRewritePattern { // Inherit the constructors. 
- using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(TF::AssignVariableOp assign_op) const override { + LogicalResult matchAndRewrite(TF::AssignVariableOp assign_op, + PatternRewriter& rewriter) const override { Value resource_operand = assign_op.getOperand(0); Value assigned_value_operand = assign_op.getOperand(1); - if (isa(resource_operand.getDefiningOp()) && - isa(assigned_value_operand.getDefiningOp())) { - return success(); - } else { + if (!isa(resource_operand.getDefiningOp()) || + !isa(assigned_value_operand.getDefiningOp())) { return failure(); } - } - void rewrite(TF::AssignVariableOp assign_op, - PatternRewriter& rewriter) const override { // `TF::ConstOp` and `TF::VarHandleOp` are not manually erased. // `applyPatternsGreedily` performs dead code elimination and unsed // ops will be erased during the optimization. rewriter.eraseOp(assign_op); + return success(); } }; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.cc index d1e46b4eb560..2605d7479e44 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.cc @@ -36,7 +36,6 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.h" @@ -628,8 +627,7 @@ Value CreateXlaConvOp(OpBuilder &builder, Location loc, Value input, Value filter, Value input_zp, Value conv_output, ArrayAttr strides, ArrayAttr dilations, StringAttr conv_padding, ArrayAttr explicit_paddings, - int feature_group_cnt, bool four_bit = false, - int num_dims = 4) { + int feature_group_cnt, int num_dims = 4) { int32_t input_zp_value; if (!GetSplatValue(input_zp, input_zp_value)) { emitError(loc, @@ -675,14 +673,6 @@ Value CreateXlaConvOp(OpBuilder &builder, Location loc, Value input, conv_padding, explicit_paddings, padding, num_dims); std::string precision_config_str; - if (four_bit) { - input = PackOperand(builder, loc, input, /*pack_dim=*/num_dims - 1); - filter = PackOperand(builder, loc, filter, /*pack_dim=*/num_dims - 2); - xla::PrecisionConfig precision_config; - precision_config.add_operand_precision(xla::PrecisionConfig::PACKED_NIBBLE); - precision_config.add_operand_precision(xla::PrecisionConfig::PACKED_NIBBLE); - precision_config_str = precision_config.SerializeAsString(); - } Value xla_conv_output = builder .create( @@ -774,14 +764,13 @@ Value CreateXlaConvOpFromTfConv3dOp(OpBuilder &builder, Location loc, return CreateXlaConvOp(builder, loc, input, filter, input_zp, conv_output, strides, dilations, conv_padding, /*explicit_paddings=*/nullptr, feature_group_cnt, - /*four_bit=*/false, /*num_dims=*/5); + /*num_dims=*/5); } // Helper function to create an XlaDotV2Op. 
Value CreateXlaDotV2Op(OpBuilder &builder, Location loc, Value input, Value weight, Value input_zp, Value weight_zp, - Value output, const xla::DotDimensionNumbers &dnums, - bool four_bit = false) { + Value output, const xla::DotDimensionNumbers &dnums) { int32_t input_zp_value = 0; int32_t weight_zp_value = 0; if (input_zp != nullptr && !GetSplatValue(input_zp, input_zp_value)) { @@ -797,14 +786,6 @@ Value CreateXlaDotV2Op(OpBuilder &builder, Location loc, Value input, } std::string precision_config_str; - if (four_bit) { - input = PackOperand(builder, loc, input, /*pack_dim=*/1); - weight = PackOperand(builder, loc, weight, /*pack_dim=*/0); - xla::PrecisionConfig precision_config; - precision_config.add_operand_precision(xla::PrecisionConfig::PACKED_NIBBLE); - precision_config.add_operand_precision(xla::PrecisionConfig::PACKED_NIBBLE); - precision_config_str = precision_config.SerializeAsString(); - } Value dot_result = builder diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_add_dump_tensor_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_add_dump_tensor_op.cc new file mode 100644 index 000000000000..9c521c1da5d9 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_add_dump_tensor_op.cc @@ -0,0 +1,321 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" +#include "tensorflow/core/platform/path.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using ::stablehlo::quantization::DebuggerConfig; +using DebuggerType = DebuggerConfig::DebuggerType; + +constexpr StringRef kOriginalEntryFuncAttrName = "_original_entry_function"; +constexpr StringRef kCompositeFuncPrefix = "composite_"; +constexpr StringRef kEmptyNodeName = "_empty_node"; + +// Returns a pair: `func_name` and `node_name` for the lifted function. In TF +// quantizer, both are filled. For StableHLO quantizer, the func_name is only +// filled and node_name is always set to "_empty_node". +std::pair GetFuncNameAndNodeName( + TF::PartitionedCallOp call_op, const FlatSymbolRefAttr &f_attr) { + std::optional quant_unit = + quant::FindQuantizationUnitFromLoc(call_op->getLoc()); + return std::make_pair(quant_unit->func_name(), quant_unit->node_name()); +} + +std::pair GetFuncNameAndNodeName( + TF::XlaCallModuleOp call_op, const FlatSymbolRefAttr &f_attr) { + return std::make_pair(f_attr.getValue().str(), kEmptyNodeName.str()); +} + +Operation *DuplicateOp(TF::PartitionedCallOp call_op, PatternRewriter &rewriter, + const StringAttr &new_ref_func_name) { + // Create PartitionedCallOp to the copied composite function. This + // PartitionedCallOp does not have kQuantTraitAttrName, and therefore won't + // get quantized. 
+ auto new_call_op = rewriter.create( + call_op.getLoc(), call_op.getResultTypes(), call_op.getOperands(), + call_op.getArgAttrsAttr(), call_op.getResAttrsAttr(), + FlatSymbolRefAttr::get(new_ref_func_name)); + return new_call_op; +} + +Operation *DuplicateOp(TF::XlaCallModuleOp call_op, PatternRewriter &rewriter, + const StringAttr &new_ref_func_name) { + // Create XlaCallModuleOp to the copied composite function. This + // XlaCallModuleOp does not have kQuantTraitAttrName, and therefore won't get + // quantized. + auto new_call_op = rewriter.create( + call_op.getLoc(), call_op.getResultTypes(), call_op.getOperands(), + call_op.getVersionAttr(), call_op.getModuleAttr(), call_op.getSoutAttr()); + new_call_op->setAttr(TF::kStablehloEntryFunctionAttrName, + rewriter.getStringAttr(new_ref_func_name.getValue())); + new_call_op->setAttrs(call_op->getAttrs()); + new_call_op->setAttr(TF::kStablehloVersionAttrName, + call_op->getAttr(TF::kStablehloVersionAttrName)); + new_call_op->removeAttr(rewriter.getStringAttr(kQuantTraitAttrName)); + + FlatSymbolRefAttr new_func_name_attr = + FlatSymbolRefAttr::get(rewriter.getContext(), new_ref_func_name); + new_call_op->setAttr(TF::kStablehloEntryFunctionAttrName, new_func_name_attr); + new_call_op->setAttr(kOriginalEntryFuncAttrName, new_ref_func_name); + return new_call_op; +} + +// AddDumpTensorOp pass adds DumpTensorOp - which saves entire value of its +// input into a file - to quantizable layer's output. +class AddDumpTensorOpPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AddDumpTensorOpPass) + + explicit AddDumpTensorOpPass() = default; + + explicit AddDumpTensorOpPass(DebuggerType debugger_type, + std::string log_dir_path) + : log_dir_path_(std::move(log_dir_path)) { + debugger_type_ = debugger_type; + } + + AddDumpTensorOpPass(const AddDumpTensorOpPass &other) { + debugger_type_ = other.debugger_type_; + log_dir_path_ = other.log_dir_path_; + } + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in the textual format (on + // the commandline for example). + return "tf-quant-add-dump-tensor-op"; + } + + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Add DumpTensor ops after quantizable ops"; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + registry.insert(); + } + + private: + void runOnOperation() override; + + Option debugger_type_{ + *this, "debugger_type", + llvm::cl::init(DebuggerConfig::DEBUGGER_TYPE_UNSPECIFIED), + llvm::cl::values( + clEnumValN(DebuggerConfig::DEBUGGER_TYPE_WHOLE_MODEL, "whole_model", + "Whole model verify"), + clEnumValN(DebuggerConfig::DEBUGGER_TYPE_INT_PER_LAYER, + "int_per_layer", "Int Per-layer verify"), + clEnumValN(DebuggerConfig::DEBUGGER_TYPE_FLOAT_PER_LAYER, + "float_per_layer", "Float Per-layer verify"))}; + + std::string log_dir_path_ = "/tmp/dumps"; +}; + +template +class AddDumpTensorOp : public OpRewritePattern { + public: + // Does not take ownership of context, which must refer to a valid value that + // outlives this object. 
+ explicit AddDumpTensorOp(MLIRContext *context, DebuggerType debugger_type, + std::string log_dir_path) + : OpRewritePattern(context), + debugger_type_(debugger_type), + log_dir_path_(std::move(log_dir_path)) {} + + LogicalResult matchAndRewrite(LiftedOpT op, + PatternRewriter &rewriter) const override { + if (match(op).failed()) { + return failure(); + } + rewrite(op, rewriter); + return success(); + } + + private: + SmallVector CreateDumpAttributes( + PatternRewriter &rewriter, const StringRef folder_name, + const StringRef file_name, const bool enabled, const StringRef func_name, + const StringRef node_name) const { + SmallVector dump_attributes{ + rewriter.getNamedAttr("log_dir_path", + rewriter.getStringAttr(folder_name)), + rewriter.getNamedAttr("file_name", rewriter.getStringAttr(file_name)), + // The op is disabled by default. Otherwise, values will be saved + // during calibration. + rewriter.getNamedAttr("enabled", rewriter.getBoolAttr(enabled)), + rewriter.getNamedAttr("func_name", rewriter.getStringAttr(func_name)), + rewriter.getNamedAttr("node_name", rewriter.getStringAttr(node_name)), + }; + return dump_attributes; + } + + StringAttr DuplicateFunction(Operation *op, + const FlatSymbolRefAttr &f_attr) const { + ModuleOp module = op->getParentOfType(); + SymbolTable symbol_table(module); + + const func::FuncOp ref_func = + dyn_cast_or_null(symbol_table.lookup(f_attr.getValue())); + func::FuncOp new_ref_func = dyn_cast(ref_func->clone()); + return symbol_table.insert(new_ref_func); + } + + LogicalResult match(LiftedOpT op) const { + if (!op->hasAttr(kQuantTraitAttrName) || op->getNumResults() != 1) { + return failure(); + } + + Value result = op->getResult(0); + for (auto user : result.getUsers()) { + if (dyn_cast_or_null(user)) return failure(); + } + + const FlatSymbolRefAttr f_attr = GetFuncAttr(op); + if (!f_attr.getValue().starts_with(kCompositeFuncPrefix)) return failure(); + return success(); + } + + void rewrite(LiftedOpT op, PatternRewriter &rewriter) const { + // Only supports ops with one result. + Value result = op->getResult(0); + rewriter.setInsertionPointAfterValue(result); + + // In whole-model mode, we first need to set file_name as + // unquantized_tensor_data.pb as it is used by the unquantized dump model. + // After saving the unquantized dump model, the file name will be changed to + // quantized_tensor_data.pb. + // Since this process doesn't happen in per-layer mode, we need to set file_name + // as quantized_tensor_data.pb here. + // TODO: b/296933893 - Refactor the debugger code when no quantize option + // is added + std::string file_name = + debugger_type_ == DebuggerConfig::DEBUGGER_TYPE_WHOLE_MODEL + ? "unquantized_tensor_data.pb" + : "quantized_tensor_data.pb"; + + const FlatSymbolRefAttr f_attr = GetFuncAttr(op); + + // In TF::PartitionedCallOp case, func_name and node_name are filled. + // But in TF::XlaCallModuleOp case, node_name is `kEmptyNodeName` since + // debugging and selective quantization of StableHLO Quantizer only uses + // func_name for op matching. + auto [func_name, node_name] = GetFuncNameAndNodeName(op, f_attr); + std::string folder_name = + tensorflow::io::JoinPath(log_dir_path_, f_attr.getValue()); + + // Attach DumpTensorOp to its output layer. + SmallVector dump_attributes = + CreateDumpAttributes(rewriter, folder_name, file_name, + /*enabled=*/true, func_name, node_name); + rewriter.create(op->getLoc(), TypeRange{}, result, + dump_attributes); + + // Per-layer mode.
+ if (debugger_type_ == DebuggerConfig::DEBUGGER_TYPE_INT_PER_LAYER || + debugger_type_ == DebuggerConfig::DEBUGGER_TYPE_FLOAT_PER_LAYER) { + // Duplicate composite function and op of quantizable layer for creating + // unquantized layer. + StringAttr new_ref_func_name = DuplicateFunction(op, f_attr); + Operation *new_op = DuplicateOp(op, rewriter, new_ref_func_name); + + // Attach second DumpTensorOp to its output unquantized layer. + SmallVector dump_attributes = CreateDumpAttributes( + rewriter, folder_name, /*file_name=*/"unquantized_tensor_data.pb", + /*enabled=*/true, func_name, node_name); + rewriter.create(op.getLoc(), TypeRange{}, + new_op->getResult(0), dump_attributes); + + if (debugger_type_ == DebuggerConfig::DEBUGGER_TYPE_FLOAT_PER_LAYER) { + // Swap all uses between call_op and ref_call_op, except for the + // particular use that owns DumpTensor. + rewriter.replaceUsesWithIf( + op.getResult(0), new_op->getResult(0), [](OpOperand &use) -> bool { + return !isa(use.getOwner()); + }); + } + } + } + + DebuggerType debugger_type_; + std::string log_dir_path_; +}; + +static PassRegistration pass; + +void AddDumpTensorOpPass::runOnOperation() { + MLIRContext *ctx = &getContext(); + RewritePatternSet patterns(ctx); + ModuleOp module = getOperation(); + + patterns.add, + AddDumpTensorOp>(ctx, debugger_type_, + log_dir_path_); + + if (failed(applyPatternsGreedily(module, std::move(patterns)))) { + module.emitError() << "quant-add-dump-tensor-op failed."; + signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr> CreateAddDumpTensorOpPass( + DebuggerType debugger_type, std::string log_dir_path) { + return std::make_unique(debugger_type, + std::move(log_dir_path)); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_add_quantization_unit_loc.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_add_quantization_unit_loc.cc new file mode 100644 index 000000000000..9e52d09e7647 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_add_quantization_unit_loc.cc @@ -0,0 +1,203 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/strings/match.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using QuantizationUnit = + tensorflow::quantization::UnitWiseQuantizationSpec::QuantizationUnit; + +// Adds QuantizationUnitLoc to quantizable layers. +class AddQuantizationUnitLocPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AddQuantizationUnitLocPass) + explicit AddQuantizationUnitLocPass() = default; + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-add-quantization-unit-loc"; + } + StringRef getDescription() const final { + return "Add QuantizationUnitLoc to quantizable layers."; + } + + private: + void runOnOperation() override; +}; + +// TF graph nodes are imported with one of following location patterns: +// FusedLoc[NameLoc(op_type:), ..., NameLoc(node_name@func_name)] or +// FusedLoc[NameLoc(op_type:), ..., CallSiteLoc(node_name@func_name)]. See +// tensorflow/compiler/mlir/tensorflow/translate/import_model.cc for more +// details. +bool IsImportLocPattern(FusedLoc loc) { + ArrayRef locations = mlir::cast(loc).getLocations(); + if (locations.size() < 2 || !isa(locations.front())) return false; + + StringRef op_type_with_suffix = + mlir::cast(locations.front()).getName().strref(); + if (!op_type_with_suffix.ends_with(":")) return false; + + return absl::c_all_of(locations, [](Location loc) { + return isa(loc) || + (isa(loc) && + isa(mlir::cast(loc).getCallee())); + }); +} + +// Finds the pattern of the location created by `ImporterBase::GetLocation` +// in `tensorflow/compiler/mlir/tensorflow/translate/import_model.cc`. +void FindQuantizationUnitsRecursively(Location loc, + SmallVector& units) { + if (!isa(loc)) return; + + auto set_node_and_func_name = [](QuantizationUnit& new_unit, + StringRef name_loc_id) { + if (name_loc_id.contains("@")) { + new_unit.set_node_name(name_loc_id.split('@').first.str()); + new_unit.set_func_name(name_loc_id.split('@').second.str()); + } else { + new_unit.set_node_name(name_loc_id.str()); + } + }; + + ArrayRef locations = mlir::cast(loc).getLocations(); + if (IsImportLocPattern(mlir::cast(loc))) { + QuantizationUnit new_unit; + // Op type is a NameLoc with the ":" suffix. 
+ StringRef op_type_with_suffix = + mlir::cast(locations.front()).getName().strref(); + StringRef op_type = + op_type_with_suffix.substr(0, op_type_with_suffix.size() - 1); + new_unit.set_op_type(op_type.str()); + + if (isa(locations.back())) { + StringRef name_loc_id = + mlir::cast(locations.back()).getName().strref(); + set_node_and_func_name(new_unit, name_loc_id); + } else { + Location callee = mlir::cast(locations.back()).getCallee(); + StringRef name_loc_id = mlir::cast(callee).getName().strref(); + set_node_and_func_name(new_unit, name_loc_id); + } + units.push_back(new_unit); + } else { + for (Location child_loc : locations) { + FindQuantizationUnitsRecursively(child_loc, units); + } + } +} + +// Finds the QuantizationUnit from location. +std::optional FindQuantizationUnit(Operation* op) { + SmallVector quant_units; + FindQuantizationUnitsRecursively(op->getLoc(), quant_units); + + if (quant_units.size() == 1) { + return *quant_units.begin(); + } + // Among units, return the one with the same type as given op. + StringRef given_op_type = op->getName().getStringRef(); + for (const QuantizationUnit& quant_unit : quant_units) { + if (absl::StrContains(given_op_type.lower(), + StringRef(quant_unit.op_type()).lower())) { + return quant_unit; + } + } + + return std::nullopt; +} + +class AddQuantizationUnitLoc : public RewritePattern { + public: + explicit AddQuantizationUnitLoc(MLIRContext* context) + : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {} + + private: + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { + if (!IsOpWithQuantizableTrait(op) || + quant::FindQuantizationUnitFromLoc(op->getLoc()).has_value()) { + return failure(); + } + + std::optional quantization_unit = + FindQuantizationUnit(op); + if (!quantization_unit.has_value()) return failure(); + + if (quantization_unit->func_name().empty()) { + std::string func_name = + op->getParentOfType().getSymNameAttr().str(); + quantization_unit->set_func_name(func_name); + } + quant::QuantizationUnitLoc unit_loc(getContext(), + quantization_unit.value()); + op->setLoc(unit_loc); + + return success(); + } +}; + +void AddQuantizationUnitLocPass::runOnOperation() { + MLIRContext* ctx = &getContext(); + RewritePatternSet patterns(ctx); + func::FuncOp func = getOperation(); + + patterns.add(ctx); + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + func.emitError() << "tf-quant-add-quantization-unit-loc pattern " + "conversion did not converge."; + signalPassFailure(); + } +} + +} // namespace + +// Creates an instance of `AddQuantizationUnitLocPass`. +std::unique_ptr> +CreateAddQuantizationUnitLocPass() { + return std::make_unique(); +} + +static PassRegistration pass; + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_cast_bf16_ops_to_f32.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_cast_bf16_ops_to_f32.cc new file mode 100644 index 000000000000..c48725069813 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_cast_bf16_ops_to_f32.cc @@ -0,0 +1,151 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace tf_quant { +namespace { + +class CastBf16OpsToF32Pass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CastBf16OpsToF32Pass) + explicit CastBf16OpsToF32Pass() = default; + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-cast-bf16-ops-to-f32"; + } + StringRef getDescription() const final { + return "Cast BF16 operations to F32."; + } + + void runOnOperation() override; +}; + +class CastBf16OpsToF32 : public RewritePattern { + public: + explicit CastBf16OpsToF32(MLIRContext* context) + : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {} + + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { + if (match(op).failed()) { + return failure(); + } + rewrite(op, rewriter); + return success(); + } + + private: + LogicalResult match(Operation* op) const { + if (isa(op) || + op->getName().hasTrait()) { + return failure(); + } + for (Value input : op->getOperands()) { + if (getElementTypeOrSelf(input).isBF16()) { + return success(); + } + } + for (Value value : op->getResults()) { + if (getElementTypeOrSelf(value).isBF16()) { + return success(); + } + } + return failure(); + } + + void rewrite(Operation* op, PatternRewriter& rewriter) const { + // Casts inputs of the operation. + for (int i = 0; i < op->getNumOperands(); i++) { + Value input = op->getOperand(i); + if (getElementTypeOrSelf(input).isBF16()) { + Value f32_cast = rewriter.create( + op->getLoc(), + CloneTypeWithNewElementType(input.getType(), rewriter.getF32Type()), + input); + op->setOperand(i, f32_cast); + } + } + + // Casts BF16 outputs of the operation. 
+ for (Value value : op->getResults()) { + if (getElementTypeOrSelf(value).isBF16()) { + value.setType(CloneTypeWithNewElementType(value.getType(), + rewriter.getF32Type())); + rewriter.setInsertionPointAfterValue(value); + for (Operation* user : op->getUsers()) { + for (int i = 0; i < user->getNumOperands(); i++) { + if (user->getOperand(i) == value) { + Value bf16_cast = rewriter.create( + user->getLoc(), + CloneTypeWithNewElementType(value.getType(), + rewriter.getBF16Type()), + value); + user->setOperand(i, bf16_cast); + } + } + } + } + } + } +}; + +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_cast_bf16_ops_to_f32.inc" + +void CastBf16OpsToF32Pass::runOnOperation() { + MLIRContext* ctx = &getContext(); + RewritePatternSet patterns(ctx); + auto module_op = getOperation(); + + patterns.add(ctx); + populateWithGenerated(patterns); + + if (failed(applyPatternsGreedily(module_op, std::move(patterns)))) { + module_op.emitError() << "tf-quant-cast-bf16-ops-to-f32 failed."; + signalPassFailure(); + } +} + +} // namespace + +// Creates an instance of the Cast BF16 ops to F32 pass. +std::unique_ptr> CreateCastBf16OpsToF32Pass() { + return std::make_unique(); +} + +static PassRegistration pass; + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_cast_bf16_ops_to_f32.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_cast_bf16_ops_to_f32.td new file mode 100644 index 000000000000..80c65560aa14 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_cast_bf16_ops_to_f32.td @@ -0,0 +1,34 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +include "mlir/IR/OpBase.td" +include "mlir/IR/PatternBase.td" +include "mlir/Dialect/Func/IR/FuncOps.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" + +//===----------------------------------------------------------------------===// +// Pattern rules for converting bfloat16 operations to fp32 conversions. +//===----------------------------------------------------------------------===// + +// Remove unneeded redundant cast ops like (f32 -> bf16 -> f32). 
+def RemoveUnneededCastOps : Pat< + (TF_CastOp:$output + (TF_CastOp + $input, $truncate_0), $truncate_1), + (replaceWithValue $input), + [(AreTheSameElementType $input, $output)]>; + diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_custom_aggregation_op_to_quant_stats.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_custom_aggregation_op_to_quant_stats.cc new file mode 100644 index 000000000000..bc75a779433c --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_custom_aggregation_op_to_quant_stats.cc @@ -0,0 +1,127 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" + +namespace mlir { +namespace tf_quant { +namespace { + +class ConvertCustomAggregationOpToQuantStatsPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + ConvertCustomAggregationOpToQuantStatsPass) + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in the textual format (on + // the commandline for example). + return "tf-quant-convert-tf-custom-aggregator-op-to-quant-stats"; + } + + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Convert tf.CustomAggregator op to quant.Stats"; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + registry.insert(); + } + + void runOnOperation() override; +}; + +class ConvertCustomAggregationOpToQuantStats + : public OpRewritePattern { + public: + // Does not take ownership of context, which must refer to a valid value that + // outlives this object. 
+ explicit ConvertCustomAggregationOpToQuantStats(MLIRContext *context) + : OpRewritePattern(context) {} + + LogicalResult matchAndRewrite(TF::CustomAggregatorOp op, + PatternRewriter &rewriter) const override { + FloatAttr min = mlir::dyn_cast_or_null(op->getAttr("min")); + FloatAttr max = mlir::dyn_cast_or_null(op->getAttr("max")); + + // When there are no min and max attributes, remove op. + if (min == nullptr || max == nullptr) { + op.getOutput().replaceAllUsesWith(op.getInput()); + rewriter.eraseOp(op); + return success(); + } + + // The layer stats contain only the first min/max pairs. + ElementsAttr layer_stats = DenseFPElementsAttr::get( + RankedTensorType::get({2}, rewriter.getF32Type()), + {static_cast(min.getValueAsDouble()), + static_cast(max.getValueAsDouble())}); + ElementsAttr axis_stats; + IntegerAttr axis; + + mlir::quant::ir::StatisticsOp stats_op = + rewriter.create( + op->getLoc(), op.getInput(), layer_stats, axis_stats, axis); + op.getOutput().replaceAllUsesWith(stats_op.getResult()); + return success(); + } +}; + +static PassRegistration pass; + +void ConvertCustomAggregationOpToQuantStatsPass::runOnOperation() { + MLIRContext *ctx = &getContext(); + RewritePatternSet patterns(ctx); + func::FuncOp func = getOperation(); + + patterns.add(ctx); + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + func.emitError() + << "tf-quant-convert-tf-custom-aggregator-op-to-quant-stats failed."; + signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr> +CreateConvertCustomAggregationOpToQuantStatsPass() { + return std::make_unique(); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_fake_quant_to_qdq.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_fake_quant_to_qdq.cc new file mode 100644 index 000000000000..e8ee46db5a96 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_fake_quant_to_qdq.cc @@ -0,0 +1,89 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project // IWYU pragma: keep, for applyPatternsGreedily +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/temp_fake_quant_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" + +namespace mlir { +namespace tf_quant { +namespace { + +class ConvertFakeQuantToQdqPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertFakeQuantToQdqPass) + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-convert-fake-quant-to-qdq"; + } + + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Convert Fake Quant op to quant.qcast and quant.dcast pairs"; + } + + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + registry.insert(); + registry.insert(); + } + + void runOnOperation() override; +}; + +static PassRegistration pass; + +void ConvertFakeQuantToQdqPass::runOnOperation() { + MLIRContext* ctx = &getContext(); + func::FuncOp func = getOperation(); + + if (failed(tf_quant::ConvertFakeQuantOps( + func, ctx, /*use_fake_quant_num_bits=*/false))) { + func.emitError() << "quant-convert-fake-quant-to-qdq pass failed."; + signalPassFailure(); + } + + // For removing dead FakeQuant* ops + RewritePatternSet patterns(ctx); + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr> CreateConvertFakeQuantToQdqPass() { + return std::make_unique(); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tf_xla_op_to_tf_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tf_xla_op_to_tf_op.cc new file mode 100644 index 000000000000..748fc756a427 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tf_xla_op_to_tf_op.cc @@ -0,0 +1,341 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "xla/xla_data.pb.h" + +namespace mlir { +namespace tf_quant { +namespace { + +class ConvertTfXlaOpToTfOpPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertTfXlaOpToTfOpPass) + + ConvertTfXlaOpToTfOpPass() = default; + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-convert-tf-xla-op-to-tf-op"; + } + + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Apply converting Tensorflow Xla ops to non-xla ops."; + } + + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + void runOnOperation() override; +}; + +// Generate an einsum equation from the given DotDimensionNumber. +std::string CreateEinsumEquation( + const xla::DotDimensionNumbers& dot_dimension_numbers, const int lhs_rank, + const int rhs_rank) { + // Prepare necessary indices. + absl::flat_hash_set lhs_batch_idx, rhs_batch_idx; + absl::flat_hash_set lhs_contract_idx, rhs_contract_idx; + lhs_batch_idx.insert(dot_dimension_numbers.lhs_batch_dimensions().begin(), + dot_dimension_numbers.lhs_batch_dimensions().end()); + lhs_contract_idx.insert( + dot_dimension_numbers.lhs_contracting_dimensions().begin(), + dot_dimension_numbers.lhs_contracting_dimensions().end()); + rhs_batch_idx.insert(dot_dimension_numbers.rhs_batch_dimensions().begin(), + dot_dimension_numbers.rhs_batch_dimensions().end()); + rhs_contract_idx.insert( + dot_dimension_numbers.rhs_contracting_dimensions().begin(), + dot_dimension_numbers.rhs_contracting_dimensions().end()); + + // Generate equation. 
+ std::string lhs_eq = ""; + std::string rhs_eq = ""; + std::string out_eq = ""; + char c = 'a'; + std::vector lhs_batch_dims; + std::vector lhs_contract_dims; + for (int i = 0; i < lhs_rank; i++) { + absl::StrAppend(&lhs_eq, std::string(1, c)); + if (lhs_batch_idx.contains(i)) { + lhs_batch_dims.push_back(c); + } else if (lhs_contract_idx.contains(i)) { + lhs_contract_dims.push_back(c); + } + c++; + } + + int batch_trace_idx = 0; + int contract_trace_idx = 0; + const bool rhs_only_batch = lhs_batch_dims.empty(); + for (int i = 0; i < rhs_rank; i++) { + if (rhs_batch_idx.contains(i)) { + if (rhs_only_batch) { + rhs_eq.push_back(c); + lhs_batch_dims.push_back(c); + c++; + } else { + rhs_eq.push_back(lhs_batch_dims[batch_trace_idx]); + batch_trace_idx++; + } + } else if (rhs_contract_idx.contains(i)) { + absl::StrAppend(&rhs_eq, + std::string(1, lhs_contract_dims[contract_trace_idx])); + contract_trace_idx++; + } else { + rhs_eq += c; + c++; + } + } + + // Create out_eq by merging lhs and rhs. + // In XlaDotv2 style - batch dim - leftover from lhs - leftover from rhs. + for (const char c : lhs_batch_dims) { + absl::StrAppend(&out_eq, std::string(1, c)); + } + for (const char c : lhs_eq) { + if (!absl::StrContains(out_eq, c) && !absl::StrContains(rhs_eq, c)) { + absl::StrAppend(&out_eq, std::string(1, c)); + } + } + for (const char c : rhs_eq) { + if (!absl::StrContains(out_eq, c) && !absl::StrContains(lhs_eq, c)) { + absl::StrAppend(&out_eq, std::string(1, c)); + } + } + + return absl::StrCat(lhs_eq, ",", rhs_eq, "->", out_eq); +} + +Value CreateEinsumOpFromXlaDotV2Op(OpBuilder& builder, const Location loc, + Value lhs, Value rhs, Value output, + StringAttr dot_dimension_numbers_str) { + xla::DotDimensionNumbers dot_dimension_numbers; + dot_dimension_numbers.ParseFromString(dot_dimension_numbers_str.str()); + SmallVector input_arguments = {lhs, rhs}; + const int lhs_rank = mlir::cast(lhs.getType()).getShape().size(); + const int rhs_rank = mlir::cast(rhs.getType()).getShape().size(); + + const std::string einsum_equation = + CreateEinsumEquation(dot_dimension_numbers, lhs_rank, rhs_rank); + + return builder.create(loc, output.getType(), input_arguments, + builder.getStringAttr(einsum_equation)); +} + +// Restores the collapsed dimensions to the `tensor_type`. `collapsed_dims` +// designate the dimension indices that were collapsed to produce `tensor_type`. +// The restored dimensions' sizes are 1, according to the semantics of +// `XlaGatherOp (https://www.tensorflow.org/xla/operation_semantics#gather). The +// resulting type's shape has `tensor_type.size() + collapsed_dims.size()` +// dimensions. +RankedTensorType RestoreCollapsedDimensions( + const RankedTensorType tensor_type, + const absl::flat_hash_set& collapsed_dims) { + ArrayRef original_tensor_shape = tensor_type.getShape(); + const int output_tensor_rank = + original_tensor_shape.size() + collapsed_dims.size(); + auto shape_itr = tensor_type.getShape().begin(); + + // Populate the dimensions of the output shape, including the restored + // dimensions. + SmallVector output_shape(output_tensor_rank); + for (int i = 0; i < output_tensor_rank; i++) { + if (collapsed_dims.contains(i)) { + // The collapsed dimension's size should have been 1, so it restores the + // dimension with size 1. 
+ output_shape[i] = 1; + } else { + output_shape[i] = *shape_itr; + shape_itr++; + } + } + + return RankedTensorType::get(output_shape, tensor_type.getElementType()); +} + +// Determines the output type of the `SliceOp` when it is being inserted in +// place of a `XlaGatherOp`. When the dimensions of `xla_gather_op_output_type` +// is known, the `collapsed_dims` are restored. `xla_gather_op_output_type` is +// the result of collapsing the `collapsed_dims`, but the `SliceOp`'s output +// should not have the dimensions collapsed already. Returns +// `xla_gather_op_output_type` unchanged if the rank is unknown. +// +// Examples: +// * If `xla_gather_op_output_type` == tensor<*xf32>, then it returns: +// tensor<*xf32>. +// * If `xla_gather_op_output_type` == tensor<3x5xi32> and `collapsed_dims` == +// {0}, then it returns: tensor<1x3x5xi32>. +// * If `xla_gather_op_output_type` == tensor<3x5xf32> and `collapsed_dims` == +// {1, 3}, then it returns: tensor<3x1x5x1xf32>. +Type GetSliceOpOutputType(Type xla_gather_op_output_type, + const absl::flat_hash_set& collapsed_dims) { + if (auto ranked_output_type = + mlir::dyn_cast(xla_gather_op_output_type); + ranked_output_type) { + return RestoreCollapsedDimensions(ranked_output_type, collapsed_dims); + } + + return xla_gather_op_output_type; +} + +// TODO (b/275225582): Supports Xla Gather op in general case. +bool IsXlaGatherWithoutBatch(Value operand, Value start_indices) { + auto operand_type = mlir::dyn_cast_or_null(operand.getType()); + auto start_indices_type = + mlir::dyn_cast_or_null(start_indices.getType()); + if (start_indices_type == nullptr || operand_type == nullptr) return false; + return start_indices_type.getShape().size() == 1; +} + +Value CreateSliceAndReshapeOpFromXlaGatherOpWithoutBatch( + OpBuilder& builder, const Location loc, Value operand, Value start_indices, + Value slice_sizes, Value output, StringAttr dimension_numbers_str) { + // Reads dimension numbers. + xla::GatherDimensionNumbers dimension_numbers; + dimension_numbers.ParseFromString(dimension_numbers_str.str()); + + // Construct full start_indices with given start_indices and + // start_index_map. + const ArrayRef operand_shape = + mlir::cast(operand.getType()).getShape(); + const int64_t operand_rank = operand_shape.size(); + + // Fills zeros if start_index is not given in start_indices. + Value empty_start_indices = builder.create( + loc, RankedTensorType::get({operand_rank}, builder.getI64Type()), + /*shape=*/Create1DConstValue(builder, loc, {operand_rank}), + /*value=*/CreateScalarConstValue(builder, loc, 0)); + + // Converts start_index_map proto to tensor. 
+ const int64_t index_map_size = dimension_numbers.start_index_map().size(); + SmallVector indices(index_map_size); + for (int64_t i = 0; i < index_map_size; i++) { + indices[i] = dimension_numbers.start_index_map()[i]; + } + + // Fill elements from start_indices with start_index_map + Value scattered_start_indices = builder.create( + loc, empty_start_indices, + /*indices=*/ + builder.create( + loc, RankedTensorType::get({index_map_size, 1}, builder.getI64Type()), + Create1DConstValue(builder, loc, indices), + Create1DConstValue(builder, loc, {index_map_size, 1})), + /*value=*/ + builder.create( + loc, + RankedTensorType::get( + mlir::cast(start_indices.getType()).getShape(), + builder.getI64Type()), + start_indices)); + + absl::flat_hash_set collapsed_dims; + collapsed_dims.insert(dimension_numbers.collapsed_slice_dims().begin(), + dimension_numbers.collapsed_slice_dims().end()); + + // Slice operand by constructed start_indices and slice_sizes. + auto slice_op = builder.create( + loc, GetSliceOpOutputType(output.getType(), collapsed_dims), operand, + /*start_indices=*/scattered_start_indices, + /*slice_sizes=*/ + builder.create( + loc, + RankedTensorType::get( + mlir::cast(slice_sizes.getType()).getShape(), + builder.getI64Type()), + slice_sizes)); + + // Collapses dimensions by reshaping. + SmallVector new_shape(operand_rank - collapsed_dims.size()); + for (int64_t i = 0, j = 0; i < operand_rank; i++) { + if (!collapsed_dims.contains(i)) { + new_shape[j++] = operand_shape[i]; + } + } + if (!new_shape.empty()) new_shape[0] = -1; + return builder.create( + loc, output.getType(), slice_op, + Create1DConstValue(builder, loc, new_shape)); +} + +bool IsPrecisionEmpty(StringAttr prec_str) { + xla::PrecisionConfig prec; + prec.ParseFromString(prec_str.str()); + return !prec.operand_precision_size(); +} + +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tf_xla_op_to_tf_op.inc" + +void ConvertTfXlaOpToTfOpPass::runOnOperation() { + MLIRContext* ctx = &getContext(); + auto func = getOperation(); + + // The pattern includes + // - Converting XlaDotV2Op to EinsumOp + // - Converting XlaGatherOp to SliceOp + RewritePatternSet patterns(ctx); + populateWithGenerated(patterns); + + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + func.emitError() << "tf-quant-converting-tf-xla-op-to-tf-op failed."; + signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr> CreateConvertTfXlaOpToTfOpPass() { + return std::make_unique(); +} + +static PassRegistration pass; + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tf_xla_op_to_tf_op.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tf_xla_op_to_tf_op.td new file mode 100644 index 000000000000..2e6e92ba467f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tf_xla_op_to_tf_op.td @@ -0,0 +1,51 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +include "mlir/IR/OpBase.td" +include "mlir/IR/PatternBase.td" +include "mlir/Dialect/Func/IR/FuncOps.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td" + +// Only handles the case where precision config is default. +def IsPrecisionEmpty : + Constraint>; + +// Creates Einsum Op from XlaDotV2 Op by generating equation. +def CreateEinsumOpFromXlaDotV2Op : NativeCodeCall< + "CreateEinsumOpFromXlaDotV2Op($_builder, $_loc, $0...)">; + +// Convert XlaDotV2 Op to Einsum Op with above two functions. +def ConvertXlaDotV2OpToEinsumOp : Pat< + (TF_XlaDotV2Op:$dot $lhs, $rhs, $dot_dimension_numbers, $precision_config), + (CreateEinsumOpFromXlaDotV2Op $lhs, $rhs, $dot, $dot_dimension_numbers), + [(IsPrecisionEmpty $precision_config)]>; + +// Only handles the case where batch_dimension is empty. +def IsXlaGatherWithoutBatch : + Constraint>; + +// Create Slice op from XlaGather op without batch dimension. +def CreateSliceAndReshapeOpFromXlaGatherOpWithoutBatch : NativeCodeCall< + "CreateSliceAndReshapeOpFromXlaGatherOpWithoutBatch($_builder, $_loc, $0...)">; + +// Convert XlaGather op without batch to Slice op with above two functions. +def ConvertXlaGatherOpWithoutBatch : Pat< + (TF_XlaGatherOp:$gather $operand, + $start_indices, $slice_sizes, $dimension_numbers, $indices_are_sorted), + (CreateSliceAndReshapeOpFromXlaGatherOpWithoutBatch $operand, + $start_indices, $slice_sizes, $gather, $dimension_numbers), + [(IsXlaGatherWithoutBatch $operand, $start_indices)]>; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tpu_model_to_cpu.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tpu_model_to_cpu.cc new file mode 100644 index 000000000000..7f12e604655e --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tpu_model_to_cpu.cc @@ -0,0 +1,155 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/remove_identity_op_pattern.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/tpu/tpu_defs.h" + +namespace mlir { +namespace tf_quant { +namespace { + +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tpu_model_to_cpu.inc" + +// Convert a TPU model to be compatible on CPU by rewriting/removing TPU ops. +class ConvertTpuModelToCpuPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ConvertTpuModelToCpuPass) + explicit ConvertTpuModelToCpuPass() = default; + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-convert-tpu-model-to-cpu"; + } + StringRef getDescription() const final { + return "Convert TPU models to CPU by rewriting TPU related operations."; + } + + void runOnOperation() override; +}; + +class RemoveTpuOp : public RewritePattern { + public: + explicit RemoveTpuOp(MLIRContext* context) + : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {} + + private: + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { + // Remove `_tpu_replicate` attributes on each operation first. + if (op->hasAttr(tensorflow::kTPUReplicateAttr)) { + op->removeAttr(tensorflow::kTPUReplicateAttr); + return success(); + } + + // Remove TPU operations. + if (isa(op)) { + op->erase(); + } else if (auto replicated_input_op = + dyn_cast_or_null(op)) { + // TODO(b/267700110): Handle multiple input/output cases. + rewriter.replaceOp(replicated_input_op, replicated_input_op.getInputs()); + } else if (auto replicated_output_op = + dyn_cast_or_null(op)) { + // TODO(b/267700110): Handle multiple input/output cases. 
+ rewriter.replaceOp(replicated_output_op, replicated_output_op.getInput()); + } else { + return failure(); + } + return success(); + } +}; + +class ReplaceTpuPartitionedCallOpWithPartitionedCallOp + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + private: + LogicalResult matchAndRewrite(TF::TPUPartitionedCallOp call_op, + PatternRewriter& rewriter) const override { + auto f_attr = mlir::dyn_cast(call_op.getFAttr()); + auto module_op = call_op->getParentOfType(); + SymbolTable symbol_table(module_op); + + auto f_name = f_attr.getValue(); + func::FuncOp float_func = + dyn_cast(symbol_table.lookup(f_name)); + if (!float_func) { + return failure(); + } + rewriter.setInsertionPointAfter(call_op); + + // The TPUPartitionedCall has a TPUOrdinalSelectorOp for its last argument + // which should be removed. So the replaced PartitionedCall op should keep + // its original arguments except for the last element. + SmallVector args = call_op.getOperands().drop_back(); + + rewriter.replaceOpWithNewOp( + call_op, float_func.getResultTypes(), args, call_op.getArgAttrsAttr(), + call_op.getResAttrsAttr(), f_attr); + return success(); + } +}; + +void ConvertTpuModelToCpuPass::runOnOperation() { + MLIRContext* ctx = &getContext(); + RewritePatternSet patterns(ctx); + ModuleOp module_op = getOperation(); + + patterns.add(ctx); + patterns.add(ctx); + patterns.add(ctx); + + if (failed(applyPatternsGreedily(module_op, std::move(patterns)))) { + module_op.emitError() << "tf-quant-convert-tpu-model-to-cpu pattern " + "conversion did not converge."; + signalPassFailure(); + return; + } +} + +} // namespace + +// Creates an instance of `ConvertTpuModelToCpuPass`. +std::unique_ptr> CreateConvertTpuModelToCpuPass() { + return std::make_unique(); +} + +static PassRegistration pass; + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tpu_model_to_cpu.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tpu_model_to_cpu.td new file mode 100644 index 000000000000..b3e6cd6bdfa5 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_convert_tpu_model_to_cpu.td @@ -0,0 +1,39 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +include "mlir/IR/OpBase.td" +include "mlir/IR/PatternBase.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" + +// Combines the two variadic arguments ($in_tensors and $captured_tensors). +def GetBatchFunctionOpArgOperands: + NativeCodeCall<"cast($0[0].getDefiningOp()).getArgOperands()">; + +def CreateEmptyDictAttr : NativeCodeCall<"$_builder.getArrayAttr({})">; + +// Replaces `TF_BatchFunctionOp` into `TF_PartitionedCallOp` that calls the +// same $f. 
This may be required, for example, when inlining is desired, +// because `TF_BatchFunctionOp` doesn't have the `CallOpInterface` trait. +def ReplaceBatchFunctionOpToPartitionedCallOp : Pat< + (TF_BatchFunctionOp:$src_op_res + $_, $_, $f, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_, $_), + (TF_PartitionedCallOp + (GetBatchFunctionOpArgOperands $src_op_res), + /*arg_attrs=*/(CreateEmptyDictAttr), + /*res_attrs=*/(CreateEmptyDictAttr), + $f, + /*config=*/(CreateStringAttr<"">), + /*config_proto=*/(CreateStringAttr<"">), + /*executor_type=*/(CreateStringAttr<"">))>; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_duplicate_shape_determining_constants.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_duplicate_shape_determining_constants.cc new file mode 100644 index 000000000000..0d8351d24064 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_duplicate_shape_determining_constants.cc @@ -0,0 +1,374 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +// Required to use LLVM_DEBUG macro. +#define DEBUG_TYPE "tf-quant-duplicate-shape-determining-constants" + +namespace mlir { +namespace tf_quant { +namespace { + +// This pass duplicates constants that affect or determine the shape of a tensor +// after being used in a computation for some op. Some specific operands of TF +// ops (like the `dim` argument for `TF::ExpandDimsOp`) determine the shape of +// the resulting tensor. If these operands are constants, they are duplicated +// and replace the shape-determining operands. Each duplicated constant will +// only be used as the shape-determining operand; it will not replace other +// usages of the original constant. If the operands are not constants (i.e. +// results of some other computation), then the pass recursively traverses the +// call tree upwards and duplicates all constants found in the subtree in a +// similar manner. +// +// This pass may be used to avoid placing shape-determining constants in the CPU +// graph and pass them as arguments to the TPU graph (via `TPUPartitionedCall`). 
+// If this happens, the XLA compiler cannot recognize such arguments as
+// constants and compilation may fail with an error.
+//
+// A set of predefined ops and operand indices is used to determine whether an
+// operand is a target for constant duplication.
+class DuplicateShapeDeterminingConstantsPass
+    : public PassWrapper<DuplicateShapeDeterminingConstantsPass,
+                         OperationPass<func::FuncOp>> {
+ public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
+      DuplicateShapeDeterminingConstantsPass)
+
+  StringRef getArgument() const final {
+    return "tf-quant-duplicate-shape-determining-constants";
+  }
+
+  StringRef getDescription() const final {
+    return "Duplicates shape-determining constants. A shape-determining "
+           "constant is a constant that is transitively used to change or "
+           "determine the shape of a tensor. For example, the second argument "
+           "'dim' to TF::ExpandDimsOp specifies the dimension index to expand.";
+  }
+
+  void runOnOperation() override;
+};
+
+// Returns true iff the output value of `op` is either a compile time constant
+// or bounded from the XLA compiler's perspective, even if it is not a
+// `ConstOp`.
+bool IsOutputCompileTimeConstantOrBounded(Operation* op) {
+  return llvm::isa_and_nonnull(op);
+}
+
+// Recursively duplicates constants for `op_operands` upward.
+void RecursivelyDuplicateConstantsForOperands(
+    llvm::ArrayRef<OpOperand*> op_operands) {
+  // Target operands to duplicate if they are defined by a ConstOp.
+  llvm::SmallVector<OpOperand*> duplication_targets{op_operands.begin(),
+                                                    op_operands.end()};
+
+  int target_idx = 0;
+  while (target_idx < duplication_targets.size()) {
+    OpOperand* curr_operand = duplication_targets[target_idx];
+    target_idx++;
+
+    Operation* owning_op = curr_operand->getOwner();
+    Operation* defining_op = curr_operand->get().getDefiningOp();
+
+    if (llvm::isa_and_nonnull<TF::ConstOp>(defining_op)) {
+      // No need to clone if this is the only use.
+      if (defining_op->hasOneUse()) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Not duplicating constant operand since it has only one "
+                      "usage. Op: "
+                   << owning_op->getName().getStringRef()
+                   << ", operand idx: " << curr_operand->getOperandNumber()
+                   << ", loc: " << owning_op->getLoc() << "\n");
+        continue;
+      }
+
+      mlir::OpBuilder builder{owning_op->getContext()};
+      builder.setInsertionPointAfter(defining_op);
+      auto const_op_cloned = builder.clone(*defining_op);
+
+      // Replace the operand with the duplicated op.
+      owning_op->setOperand(curr_operand->getOperandNumber(),
+                            const_op_cloned->getResult(0));
+
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Duplicated constant operand from: "
+                 << owning_op->getName().getStringRef()
+                 << ", operand idx: " << curr_operand->getOperandNumber()
+                 << ", loc: " << const_op_cloned->getLoc() << "\n");
+    } else if (IsOutputCompileTimeConstantOrBounded(defining_op)) {
+      // Stop the recursion early when the output of the defining op is
+      // considered compile-time constant from the XLA compiler's perspective.
+      continue;
+    } else if (!defining_op) {
+      // One example for this case is when `curr_operand` is a function
+      // argument.
+      owning_op->emitWarning()
+          << "Operand idx (zero-based): " << curr_operand->getOperandNumber()
+          << " does not have a defining op and cannot be duplicated.";
+    } else {
+      // If the operand's defining op is not a ConstOp, recursively traverse
+      // "upwards" to find ConstOps that transitively produce the current
+      // operand and duplicate them.
+      auto op_operands = defining_op->getOpOperands();
+      absl::c_transform(
+          op_operands, std::back_inserter(duplication_targets),
+          [](OpOperand& op_operand) -> OpOperand* { return &op_operand; });
+    }
+  }
+}
+
+// Evaluates `operand_idx` w.r.t. `op`'s operands. If `operand_idx` is zero or
+// a positive number, it is returned as-is. If it is a negative number, it
+// counts backwards from the end, and the corresponding zero-based operand
+// index for `op` is returned.
+//
+// `operand_idx` should be within the range: [-num_operands, num_operands - 1].
+int EvaluateOperandIdx(const int operand_idx, Operation& op) {
+  if (operand_idx < 0) {
+    // Calculate the actual index if a negative value is provided for
+    // `operand_idx`.
+    return op.getNumOperands() + operand_idx;
+  }
+  return operand_idx;
+}
+
+// Returns the pointers to operands at `operand_indices` of `op`.
+llvm::SmallVector<OpOperand*> GetOperands(Operation& op,
+                                          llvm::ArrayRef<int> operand_indices) {
+  llvm::SmallVector<OpOperand*> operands{};
+  for (const int operand_idx : operand_indices) {
+    const int evaluated_operand_idx = EvaluateOperandIdx(operand_idx, op);
+    operands.emplace_back(&op.getOpOperand(evaluated_operand_idx));
+  }
+
+  return operands;
+}
+
+// Represents an op type and its operand indices that should be "compile time
+// constant" from the XLA compiler's point of view.
+template <typename OpT, int... OperandIdx>
+struct CompileTimeConstantOperand {
+  static_assert(
+      sizeof...(OperandIdx) > 0,
+      "CompileTimeConstantOperand should have at least one operand index.");
+
+  using OpType = OpT;
+
+  // Returns the indices of operands that should be compile time constants.
+  static constexpr std::array<int, sizeof...(OperandIdx)> OperandIndices() {
+    return {OperandIdx...};
+  }
+};
+
+// Finds all ops of type `T::OpType` in `func_op` and recursively duplicates
+// constants used at the op's operands at `T::OperandIndices()`. It then does
+// the same thing sequentially for the remaining `Ts`.
+template <typename T, typename... Ts>
+void DuplicateShapeDeterminingConstants(func::FuncOp func_op) {
+  for (auto op : func_op.getOps<typename T::OpType>()) {
+    RecursivelyDuplicateConstantsForOperands(
+        GetOperands(*op, T::OperandIndices()));
+  }
+
+  // Do the same thing for the rest of `Ts`.
+ if constexpr (sizeof...(Ts) != 0) { + DuplicateShapeDeterminingConstants(func_op); + } +} + +void DuplicateShapeDeterminingConstantsPass::runOnOperation() { + func::FuncOp func_op = getOperation(); + + DuplicateShapeDeterminingConstants< + // go/keep-sorted start + CompileTimeConstantOperand, // $group_assignment + CompileTimeConstantOperand, // $dimension + CompileTimeConstantOperand, // $dimension + // $orig_input_shape + CompileTimeConstantOperand, + // $orig_input_shape + CompileTimeConstantOperand, + // $block_shape, $crops + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $crops + CompileTimeConstantOperand, // $size + CompileTimeConstantOperand, // $s0, $s1 + // $s0, $s1 + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $shape + /// $group_assignment + CompileTimeConstantOperand, + // $source_target_pairs + CompileTimeConstantOperand, + // $group_size, $group_key + CompileTimeConstantOperand, + CompileTimeConstantOperand, // (variadic) $axis + // $filter_sizes + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $input_sizes + // $filter_sizes + CompileTimeConstantOperand, + // $input_sizes + CompileTimeConstantOperand, + // $group_assignment + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $axis + CompileTimeConstantOperand, // $axis + CompileTimeConstantOperand, // $axis + // $filter_sizes + CompileTimeConstantOperand, + // $input_sizes + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $shape + // $element_shape, $max_num_elements + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $dim + CompileTimeConstantOperand, // $dims + CompileTimeConstantOperand, // $axis + CompileTimeConstantOperand, // $fft_length + CompileTimeConstantOperand, // $fft_length + CompileTimeConstantOperand, // $fft_length + CompileTimeConstantOperand, // $k + CompileTimeConstantOperand, // $num + CompileTimeConstantOperand, // $x, $y + // $k, $padding_value + CompileTimeConstantOperand, + // $k, $num_rows, $num_cols, $padding_value + CompileTimeConstantOperand, + // $k, $num_rows, $num_cols, $padding_value + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $k + CompileTimeConstantOperand, // $k + CompileTimeConstantOperand, // $reduction_indices + // $ksize, $strides + CompileTimeConstantOperand, + // $ksize, $strides + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $ksize, $strides + CompileTimeConstantOperand, // $reduction_indices + CompileTimeConstantOperand, // $paddings + CompileTimeConstantOperand, // $paddings + CompileTimeConstantOperand, // $num_samples + // $max_output_size + CompileTimeConstantOperand, + // $max_output_size + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $depth + CompileTimeConstantOperand, // $paddings + CompileTimeConstantOperand, // $paddings + // $shape + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $fft_length + CompileTimeConstantOperand, // $fft_length + CompileTimeConstantOperand, // $fft_length + CompileTimeConstantOperand, // $shape + CompileTimeConstantOperand, // $shape + CompileTimeConstantOperand, // $shape + // $start, $limit, $delta + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $shape + CompileTimeConstantOperand, // $size + CompileTimeConstantOperand, // $size + // $begin, $end, $strides + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $dims + CompileTimeConstantOperand, // $axis + CompileTimeConstantOperand, // $shape + CompileTimeConstantOperand, // $num_segments + CompileTimeConstantOperand, // 
$begin, $size + CompileTimeConstantOperand, // $output_shape + CompileTimeConstantOperand, // $split_dim + // $size_splits, $split_dim + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $max_size + // $num_samples + CompileTimeConstantOperand, + // $shape, $begin, $end, $strides + CompileTimeConstantOperand, + // $begin, $end, $strides + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $reduction_indices + CompileTimeConstantOperand, // $lengths + CompileTimeConstantOperand, // $size + // $element_shape + CompileTimeConstantOperand, + // $element_shape, $num_elements + CompileTimeConstantOperand, + // $begin, $end, $strides + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $multiples + CompileTimeConstantOperand, // $k + CompileTimeConstantOperand, // $perm + CompileTimeConstantOperand, // $shape + CompileTimeConstantOperand, // $num_segments + CompileTimeConstantOperand, // $num_segments + CompileTimeConstantOperand, // $num_segments + // $broadcast_dims + CompileTimeConstantOperand, + // $window_strides, $padding, $lhs_dilation, $rhs_dilation, + // $feature_group_count + CompileTimeConstantOperand, + // $window_strides, $padding, $lhs_dilation, $rhs_dilation, + // $feature_group_count + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $slice_indices + CompileTimeConstantOperand, // $slice_sizes + // $padding_low, $padding_high, $padding_interior + CompileTimeConstantOperand, + // $window_dimensions, $window_strides, $base_dilations, + // $window_dilations, $padding + CompileTimeConstantOperand, + // $dim_index + CompileTimeConstantOperand, + // $window_dimensions, $window_strides, $padding + CompileTimeConstantOperand, + CompileTimeConstantOperand, // $bound + // $dim_index + CompileTimeConstantOperand + // go/keep-sorted end + >(func_op); +} + +static PassRegistration pass{}; + +} // namespace + +std::unique_ptr> +CreateDuplicateShapeDeterminingConstantsPass() { + return std::make_unique(); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_custom_aggregation_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_custom_aggregation_ops.cc new file mode 100644 index 000000000000..f8808748885c --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_custom_aggregation_ops.cc @@ -0,0 +1,370 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include +#include +#include + +#include "absl/status/statusor.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/CommandLine.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using ::stablehlo::quantization::CalibrationOptions; +using ::stablehlo::quantization::Method; + +constexpr StringRef kQuantTraitAttrName = "_tfl_quant_trait"; + +// Whether the op is a call op to lifted composite function. +bool IsCallToQuantizableLiftedFunction(Operation *op) { + if (!op) return false; + if (auto xla_call_module_op = dyn_cast_or_null(op); + xla_call_module_op != nullptr) { + absl::StatusOr method = GetQuantizationMethod(xla_call_module_op); + if (method.ok() && method->has_static_range_ptq()) return true; + } + + TF::PartitionedCallOp call_op = dyn_cast_or_null(op); + return call_op && call_op->hasAttrOfType(kQuantTraitAttrName) && + call_op->getAttrOfType(kQuantTraitAttrName).getValue() == + llvm::StringRef( + QuantTraitValues[QuantizationTrait::FullyQuantizable]); +} + +// Returns the composite function name. 
+std::optional GetCompsiteFunctionName(Operation *op) { + if (!IsCallToQuantizableLiftedFunction(op)) return std::nullopt; + + if (auto xla_call_module_op = dyn_cast_or_null(op); + xla_call_module_op != nullptr) { + auto entry_function_attr = xla_call_module_op->getAttrOfType( + kOriginalStablehloEntryFunctionAttrName); + if (!entry_function_attr) return std::nullopt; + return entry_function_attr.getValue(); + } else { + TF::PartitionedCallOp call_op = dyn_cast_or_null(op); + const auto f_attr = mlir::dyn_cast(call_op.getFAttr()); + if (!f_attr) return std::nullopt; + return f_attr.getValue(); + } +} + +class InsertCustomAggregationOpsPass + : public PassWrapper> { + public: + explicit InsertCustomAggregationOpsPass() : test_mode_(true) { + initializeForTest(); + } + + explicit InsertCustomAggregationOpsPass(const CalibrationOptions &calib_opts) + : test_mode_(false), calib_opts_(calib_opts) {} + + InsertCustomAggregationOpsPass(const InsertCustomAggregationOpsPass &other) { + test_mode_ = other.test_mode_; + test_case_ = other.test_case_; + calib_opts_ = other.calib_opts_; + initializeForTest(); + } + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InsertCustomAggregationOpsPass) + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in the textual format (on + // the commandline for example). + return "tf-quant-insert-custom-aggregation-ops"; + } + + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Insert custom aggregation ops for the calibration procedure"; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override; + + private: + enum TestCase { + TEST_CASE_MIN_MAX, + TEST_CASE_AVERAGE_MIN_MAX, + TEST_CASE_HISTOGRAM_PERCENTILE, + TEST_CASE_HISTOGRAM_MSE_BRUTEFORCE, + TEST_CASE_HISTOGRAM_MSE_MAX_FREQUENCY, + TEST_CASE_HISTOGRAM_MSE_SYMMETRIC, + }; + + bool test_mode_; + CalibrationOptions calib_opts_; + Option test_case_{ + *this, "test-case", + llvm::cl::desc( + "Select a the test case for testing various calibration methods. It " + "sets the value of calib_opts_ when test_mode_ is true."), + llvm::cl::init(TEST_CASE_MIN_MAX), + llvm::cl::values( + clEnumValN(TEST_CASE_MIN_MAX, "MIN_MAX", + "Uses MIN_MAX calibration method"), + clEnumValN(TEST_CASE_AVERAGE_MIN_MAX, "AVERAGE_MIN_MAX", + "Uses AVERAGE_MIN_MAX calibration method"), + clEnumValN(TEST_CASE_HISTOGRAM_PERCENTILE, "HISTOGRAM_PERCENTILE", + "Uses HISTOGRAM_PERCENTILE calibration method"), + clEnumValN(TEST_CASE_HISTOGRAM_MSE_BRUTEFORCE, + "HISTOGRAM_MSE_BRUTEFORCE", + "Uses HISTOGRAM_MSE_BRUTEFORCE calibration method"), + clEnumValN(TEST_CASE_HISTOGRAM_MSE_MAX_FREQUENCY, + "HISTOGRAM_MSE_MAX_FREQUENCY", + "Uses HISTOGRAM_MSE_MAX_FREQUENCY calibration " + "method"), + clEnumValN(TEST_CASE_HISTOGRAM_MSE_SYMMETRIC, + "HISTOGRAM_MSE_SYMMETRIC", + "Uses HISTOGRAM_MSE_SYMMETRIC calibration " + "method"))}; + + // Initialize for tests. 
+ void initializeForTest() { + if (!test_mode_) return; + + switch (test_case_.getValue()) { + case TEST_CASE_MIN_MAX: + calib_opts_.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_MIN_MAX); + break; + case TEST_CASE_AVERAGE_MIN_MAX: + calib_opts_.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX); + break; + case TEST_CASE_HISTOGRAM_PERCENTILE: { + calib_opts_.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE); + auto calibration_parameters = + CalibrationOptions::CalibrationParameters(); + calibration_parameters.set_num_bins(512); + calibration_parameters.set_min_percentile(0.001); + calibration_parameters.set_max_percentile(99.999); + calib_opts_.mutable_calibration_parameters()->CopyFrom( + calibration_parameters); + break; + } + case TEST_CASE_HISTOGRAM_MSE_BRUTEFORCE: { + calib_opts_.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE); + auto calibration_parameters = + CalibrationOptions::CalibrationParameters(); + calibration_parameters.set_num_bins(512); + calib_opts_.mutable_calibration_parameters()->CopyFrom( + calibration_parameters); + break; + } + case TEST_CASE_HISTOGRAM_MSE_MAX_FREQUENCY: { + calib_opts_.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY); + auto calibration_parameters = + CalibrationOptions::CalibrationParameters(); + calibration_parameters.set_num_bins(512); + calib_opts_.mutable_calibration_parameters()->CopyFrom( + calibration_parameters); + break; + } + case TEST_CASE_HISTOGRAM_MSE_SYMMETRIC: { + calib_opts_.set_calibration_method( + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC); + auto calibration_parameters = + CalibrationOptions::CalibrationParameters(); + calibration_parameters.set_num_bins(512); + calib_opts_.mutable_calibration_parameters()->CopyFrom( + calibration_parameters); + break; + } + } + } +}; + +static PassRegistration pass; + +class AddCustomAggregationOp : public RewritePattern { + public: + // Does not take ownership of context, which must refer to a valid value that + // outlives this object. + explicit AddCustomAggregationOp(MLIRContext *context, + const CalibrationOptions &calib_opts) + : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context), + calib_opts_(calib_opts) {} + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + // Return early if the given operator is the custom aggregator op. + if (dyn_cast_or_null(op)) return failure(); + + // The CustomAggregatorOp is only added after quantizable values. + SmallVector quantizable_values; + SmallVector aggregator_ids; + if (IsCallToQuantizableLiftedFunction(op)) { + std::optional composite_function_name = + GetCompsiteFunctionName(op); + if (!composite_function_name.has_value()) return failure(); + + // Quantize inputs of quantizable composite functions. + for (OpOperand &input : op->getOpOperands()) { + Type element_type = getElementTypeOrSelf(input.get().getType()); + // Non-float cases won't be calibrated. + if (!element_type.isF32()) { + continue; + } + + // Skip when there is any already existing CustomAggregatorOp found. + Operation *defining_op = input.get().getDefiningOp(); + if (dyn_cast_or_null(defining_op)) { + continue; + } + + // Skip calibration when the given operand comes from a constant. 
+ if (defining_op != nullptr && + defining_op->hasTrait()) { + continue; + } + + quantizable_values.push_back(input.get()); + aggregator_ids.push_back( + (llvm::Twine(composite_function_name.value()) + "_arg_" + + llvm::Twine(input.getOperandNumber()) + "_calibration_method_" + + llvm::Twine(calib_opts_.calibration_method())) + .str()); + } + } else { + // Quantize output of fully quantizable composite functions. + for (Value input : op->getOperands()) { + auto defining_op = input.getDefiningOp(); + std::optional composite_function_name = + GetCompsiteFunctionName(defining_op); + if (!composite_function_name.has_value()) continue; + + // Do not add CustomAggregatorOp after Gather since it is a weight-only + // quantizable op. + if (auto call_op = + dyn_cast_or_null(defining_op)) { + StringRef function_name = + mlir::cast(call_op.getFAttr()).getValue(); + if (function_name.contains("gather")) continue; + } + + quantizable_values.push_back(input); + // All composite functions have a single result at the moment. + aggregator_ids.push_back((llvm::Twine(composite_function_name.value()) + + "_calibration_method_" + + llvm::Twine(calib_opts_.calibration_method())) + .str()); + } + } + if (quantizable_values.empty()) return failure(); + + int32_t effective_num_bins = GetNumBins(calib_opts_); + for (auto [value, aggregator_id] : + llvm::zip_equal(quantizable_values, aggregator_ids)) { + // ID attribute will have empty value for now. + SmallVector attributes{ + rewriter.getNamedAttr("id", rewriter.getStringAttr(aggregator_id)), + rewriter.getNamedAttr( + "calibration_method", + rewriter.getI32IntegerAttr(calib_opts_.calibration_method())), + rewriter.getNamedAttr("num_bins", + rewriter.getI32IntegerAttr(effective_num_bins)), + rewriter.getNamedAttr( + "min_percentile", + rewriter.getF32FloatAttr( + calib_opts_.calibration_parameters().min_percentile())), + rewriter.getNamedAttr( + "max_percentile", + rewriter.getF32FloatAttr( + calib_opts_.calibration_parameters().max_percentile())), + }; + + SmallVector output_types{ + value.getType(), + RankedTensorType::get({}, rewriter.getF32Type()), + RankedTensorType::get({}, rewriter.getF32Type()), + RankedTensorType::get({effective_num_bins}, rewriter.getI64Type()), + }; + + // Insert custom aggregation op between operand and operator. 
+ rewriter.setInsertionPointAfterValue(value); + Operation *aggregator_op = rewriter.create( + op->getLoc(), output_types, value, attributes); + + Value aggregator_op_result = aggregator_op->getOpResult(0); + value.replaceAllUsesExcept(aggregator_op_result, aggregator_op); + } + + return success(); + } + + private: + CalibrationOptions calib_opts_; +}; + +void InsertCustomAggregationOpsPass::runOnOperation() { + MLIRContext *ctx = &getContext(); + RewritePatternSet patterns(ctx); + func::FuncOp func = getOperation(); + + patterns.add(ctx, calib_opts_); + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + func.emitError() << "tf-quant-insert-custom-aggregation-ops failed."; + signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr> +CreateInsertCustomAggregationOpsPass(const CalibrationOptions &calib_opts) { + return std::make_unique(calib_opts); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_main_function.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_main_function.cc new file mode 100644 index 000000000000..d73529a43c7a --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_main_function.cc @@ -0,0 +1,442 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Interfaces/FunctionInterfaces.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using ::mlir::tf_saved_model::kTfSavedModelExportedNamesAttr; +using ::mlir::tf_saved_model::kTfSavedModelIndexPathAttr; +using ::tensorflow::kImportModelDefaultGraphFuncName; + +constexpr StringRef kEntryFunctionAttr = "tf.entry_function"; + +// The ConvertMlirToGraphdef requires the provided input module to have a main +// function, which might not exist in case of multi-signature graphs. In that +// case, this pass will create a new main function, which calls signature +// functions. +// +// An already existing @main function will be renamed by attaching a numeric +// suffix like `@main_0` to avoid conflict with the newly created main function. +class TFInsertMainFunctionPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TFInsertMainFunctionPass) + + explicit TFInsertMainFunctionPass() = default; + + StringRef getArgument() const override { + return "tf-quant-insert-main-function"; + } + + StringRef getDescription() const override { + return "Inserts the main function to the module."; + } + + void runOnOperation() override; +}; + +// Checks if a FuncOp is exported. +bool IsExported(func::FuncOp op) { + auto exported_names = + op->getAttrOfType(kTfSavedModelExportedNamesAttr); + return exported_names && !exported_names.empty(); +} + +// Check if a function is an entry function. +bool IsEntryFunction(func::FuncOp op) { + return op->hasAttr(kEntryFunctionAttr); +} + +// Returns true iff the provided FuncOp is qualified to be included in the main +// function. +bool ShouldIncludeInMainFunction(func::FuncOp func_op) { + return !func_op.isPrivate() && IsExported(func_op) && + IsEntryFunction(func_op); +} + +// Sets a function to be private so it can be referred internally. +void SetFunctionPrivate(func::FuncOp func) { + func.setVisibility(SymbolTable::Visibility::Private); + + // The `tf_saved_model` attributes can only be applied to public functions. 
+ for (auto& attr : func->getAttrs()) { + StringRef attr_name = attr.getName().getValue(); + if (attr_name.starts_with("tf_saved_model.")) { + func->removeAttr(attr_name); + } + } + + auto iface = cast(func.getOperation()); + for (int i = 0; i < func.getNumArguments(); ++i) { + for (auto& attr : iface.getArgAttrs(i)) { + const StringAttr& attr_name = attr.getName(); + if (attr_name.getValue().starts_with("tf_saved_model.")) { + func.removeArgAttr(i, attr_name); + } + } + } + for (int i = 0; i < func.getNumResults(); ++i) { + for (auto& attr : iface.getResultAttrs(i)) { + const StringAttr& attr_name = attr.getName(); + if (attr_name.getValue().starts_with("tf_saved_model.")) { + func.removeResultAttr(i, attr_name); + } + } + } +} + +// Information to identify an output in its node and in the model output list. +// Ex: If the model output list is ["add:0", "topk:0": "topk:1"], then the +// output corresponding to "topk:1" will have output_index=2 and tensor_index=1. +struct OutputInfo { + // The index of this output in the model output list. + int32_t output_index; + // The index of this output in its node. + int32_t tensor_index; + // The output value. + Value value; +}; + +// Makes input/output names across entry functions unique if necessary. If a +// duplicated name is found, this function will add signature prefix for all the +// input/output names. +void GetUniqueInputOutputNodeNames(ModuleOp module_op, + std::vector& input_name_vec, + std::vector& output_name_vec) { + bool need_prefix_for_input_name = false; + bool need_prefix_for_output_name = false; + std::vector fn_input_name_vec, fn_output_name_vec; + llvm::StringSet<> input_name_set, output_name_set; + for (auto func_op : module_op.getOps()) { + if (!ShouldIncludeInMainFunction(func_op)) continue; + if (auto tf_attrs = + func_op->getAttrOfType(kEntryFunctionAttr)) { + StringRef function_name = func_op.getSymName(); + + if (auto inputs_attr = tf_attrs.get("inputs")) { + const std::string inputs_attr_str = + mlir::cast(inputs_attr).getValue().str(); + std::vector fn_input_names = + absl::StrSplit(inputs_attr_str, ',', absl::SkipEmpty()); + + for (StringRef input_name : fn_input_names) { + if (input_name_set.contains(input_name)) { + // Found a duplicated name, all input names will be prefixed by + // their corresponding function names. + need_prefix_for_input_name = true; + } + input_name_set.insert(input_name); + fn_input_name_vec.push_back(function_name); + } + input_name_vec.insert(input_name_vec.end(), + std::make_move_iterator(fn_input_names.begin()), + std::make_move_iterator(fn_input_names.end())); + } + + if (auto outputs_attr = tf_attrs.get("outputs")) { + const std::string outputs_attr_str = + mlir::cast(outputs_attr).getValue().str(); + std::vector fn_output_names = + absl::StrSplit(outputs_attr_str, ',', absl::SkipEmpty()); + + for (StringRef output_name : fn_output_names) { + if (output_name_set.contains(output_name)) { + // Found a duplicated name, all output names will be prefixed by + // their corresponding function names. 
+ need_prefix_for_output_name = true; + } + output_name_set.insert(output_name); + fn_output_name_vec.push_back(function_name); + } + output_name_vec.insert(output_name_vec.end(), + std::make_move_iterator(fn_output_names.begin()), + std::make_move_iterator(fn_output_names.end())); + } + } + } + + if (need_prefix_for_input_name) { + absl::c_transform( + input_name_vec, fn_input_name_vec, input_name_vec.begin(), + [](const std::string& input_name, const StringRef fn_name) { + return absl::StrCat(fn_name.str(), "_", input_name); + }); + } + if (need_prefix_for_output_name) { + absl::c_transform( + output_name_vec, fn_output_name_vec, output_name_vec.begin(), + [](const std::string& output_name, const StringRef fn_name) { + return absl::StrCat(fn_name.str(), "_", output_name); + }); + } +} + +// Creates a main function which calls other exported functions. +bool CreateMainFunction(ModuleOp module_op) { + MLIRContext* context = module_op.getContext(); + OpBuilder builder(context); + + std::vector input_names, output_names; + GetUniqueInputOutputNodeNames(module_op, input_names, output_names); + + // Collects argument and result types. + llvm::SmallVector arg_locs; + llvm::SmallVector arg_types, result_types; + + for (auto func_op : module_op.getOps()) { + if (!ShouldIncludeInMainFunction(func_op)) continue; + + arg_types.append(func_op.getArgumentTypes().begin(), + func_op.getArgumentTypes().end()); + auto& return_op = func_op.getBody().getBlocks().front().back(); + result_types.append(return_op.getOperandTypes().begin(), + return_op.getOperandTypes().end()); + for (const auto& arg : func_op.getArguments()) { + arg_locs.push_back(arg.getLoc()); + } + } + + // Creates a new main function. + auto func_type = FunctionType::get(context, arg_types, result_types); + auto main_func = builder.create( + module_op.getLoc(), kImportModelDefaultGraphFuncName, func_type); + builder.createBlock(&main_func.getBody(), main_func.begin(), arg_types, + arg_locs); + SmallVector func_attrs; + func_attrs.push_back( + {StringAttr::get(context, "inputs"), + StringAttr::get(context, absl::StrJoin(input_names, ","))}); + func_attrs.push_back( + {StringAttr::get(context, "outputs"), + StringAttr::get(context, absl::StrJoin(output_names, ","))}); + auto dictAttr = DictionaryAttr::get(context, func_attrs); + main_func->setAttr(StringAttr::get(context, kEntryFunctionAttr), dictAttr); + main_func->setAttr( + kTfSavedModelExportedNamesAttr, + builder.getStrArrayAttr({kImportModelDefaultGraphFuncName})); + + if (input_names.size() != main_func.getNumArguments() || + output_names.size() != main_func.getNumResults()) { + module_op.emitError() + << "Number of inputs and outputs in the tf.entry_function attribute " + "mismatched. [Input] Expected: " + << input_names.size() << ", got: " << main_func.getNumArguments() + << ". [Output] Expected: " << output_names.size() + << ", got: " << main_func.getNumResults(); + return false; + } + + const int num_args = main_func.getNumArguments(); + for (int i = 0; i < num_args; ++i) { + main_func.setArgAttr( + i, kTfSavedModelIndexPathAttr, + ArrayAttr::get(context, {StringAttr::get(context, input_names[i])})); + } + + const int num_results = main_func.getNumResults(); + for (int i = 0; i < num_results; ++i) { + main_func.setResultAttr( + i, kTfSavedModelIndexPathAttr, + ArrayAttr::get(context, {StringAttr::get(context, output_names[i])})); + } + + // Creates PartitionedCall ops to call exported functions. 
+ auto guard = OpBuilder::InsertionGuard(builder); + int arg_idx = 0; + int result_idx = 0; + llvm::SmallVector call_op_returns; + for (auto func_op : module_op.getOps()) { + if (!ShouldIncludeInMainFunction(func_op)) continue; + + llvm::ArrayRef new_args = llvm::ArrayRef( + main_func.getArguments().begin() + arg_idx, func_op.getNumArguments()); + arg_idx += func_op.getNumArguments(); + llvm::ArrayRef new_types = llvm::ArrayRef( + result_types.begin() + result_idx, func_op.getNumResults()); + result_idx += func_op.getNumResults(); + + auto call_op = builder.create( + module_op.getLoc(), new_types, new_args, /*args_attrs=*/nullptr, + /*res_attrs=*/nullptr, + SymbolRefAttr::get(context, func_op.getSymName()), + /*config=*/builder.getStringAttr(""), + /*config_proto=*/builder.getStringAttr(""), + /*executor_type=*/builder.getStringAttr("")); + call_op_returns.append(call_op.getResults().begin(), + call_op.getResults().end()); + SetFunctionPrivate(func_op); + } + + // Creates Identity/IdentityN ops for returing values. This allows us to + // restore the same output tensor names in python. + int32_t output_count = 0; + // Map from node name to the list of the OutputInfos of its outputs that are + // used as the model outputs. + llvm::StringMap> node_to_output_map; + for (auto [output_name, call_op_return] : + llvm::zip(output_names, call_op_returns)) { + std::vector name_and_index = + absl::StrSplit(output_name, ':', absl::SkipEmpty()); + llvm::StringRef node_name = name_and_index.front(); + int32_t tensor_index = 0; + if (name_and_index.size() > 1) { + tensor_index = std::stoi(name_and_index.back()); + } + node_to_output_map[node_name].push_back( + {output_count++, tensor_index, call_op_return}); + } + + Value scalar_one = + CreateScalarConstValue(builder, builder.getUnknownLoc(), 1.0); + llvm::SmallVector returning_values(output_count, Value()); + for (const auto& node_name : node_to_output_map.keys()) { + auto node_output_tensors = node_to_output_map[node_name]; + + NameLoc new_loc = NameLoc::get(builder.getStringAttr(node_name)); + int32_t max_tensor_index = 0; + absl::c_for_each(node_output_tensors, + [&max_tensor_index](const OutputInfo& output_info) { + max_tensor_index = + std::max(max_tensor_index, output_info.tensor_index); + }); + + // Create IdentityOp or IdentityNOp based on the number of outputs. + Operation* identity_op; + if (max_tensor_index == 0) { + Value output_value = node_output_tensors.front().value; + identity_op = builder.create( + new_loc, output_value.getType(), output_value); + } else { + llvm::SmallVector input_values(node_output_tensors.size(), + scalar_one); + for (const auto& [output_index, tensor_index, tensor_value] : + node_output_tensors) { + input_values[tensor_index] = tensor_value; + } + identity_op = builder.create( + new_loc, TypeRange(ValueRange(input_values)), input_values); + } + + for (const auto& [output_index, tensor_index, tensor_value] : + node_output_tensors) { + returning_values[output_index] = identity_op->getResult(tensor_index); + } + } + builder.create(main_func.getBody().getLoc(), + returning_values); + + // Adds the new function to symbol table. + SymbolTable symbol_table(module_op); + symbol_table.insert(main_func); + return true; +} + +// Creates a new function name by attaching a number suffix +// (`main_func_name_{i}`) and incrementing it until there are no conflicts. 
+std::string CreateNewFuncName(const StringRef main_func_name, + SymbolTable& symbol_table) { + int suffix_id = 0; + std::string new_func_name = + absl::StrCat(main_func_name.str(), "_", suffix_id); + while (symbol_table.lookup(new_func_name)) { + suffix_id++; + new_func_name = absl::StrCat(main_func_name.str(), "_", suffix_id); + } + + return new_func_name; +} + +// Renames the existing @main function to avoid conflict with the newly +// created main function. When it is renamed, its usages will also be replaced. +// It will be renamed by attaching a number suffix like `@main_{i}`, until there +// are no conflicts. This function is a no-op when no function called @main +// exists. +LogicalResult RenameExistingMainFunction(ModuleOp module_op) { + SymbolTable symbol_table(module_op); + + auto main_func_op = + symbol_table.lookup(kImportModelDefaultGraphFuncName); + if (!main_func_op) { + return success(); + } + + const std::string new_func_name = + CreateNewFuncName(main_func_op.getSymName(), symbol_table); + + main_func_op.setSymName(new_func_name); + return symbol_table.replaceAllSymbolUses( + main_func_op, StringAttr::get(module_op.getContext(), new_func_name), + module_op); +} + +void TFInsertMainFunctionPass::runOnOperation() { + ModuleOp module_op = getOperation(); + + if (failed(RenameExistingMainFunction(module_op))) { + module_op->emitError("Failed to rename existing function `@main`."); + signalPassFailure(); + } + + if (!CreateMainFunction(module_op)) { + signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr> CreateInsertMainFunctionPass() { + return std::make_unique(); +} + +static PassRegistration pass([] { + return CreateInsertMainFunctionPass(); +}); + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_quantized_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_quantized_functions.cc new file mode 100644 index 000000000000..f4c75648b2ee --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_quantized_functions.cc @@ -0,0 +1,224 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/UB/IR/UBOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/quantized_function_library.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using QuantMethod = tensorflow::quantization::QuantizationMethod::PresetMethod; +using ::tensorflow::quantization::OpSet; + +class InsertQuantizedFunctionsPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InsertQuantizedFunctionsPass) + + explicit InsertQuantizedFunctionsPass() = default; + explicit InsertQuantizedFunctionsPass(QuantMethod quantization_method, + OpSet op_set) { + quantization_method_ = quantization_method; + op_set_ = op_set; + } + InsertQuantizedFunctionsPass(const InsertQuantizedFunctionsPass& other) { + quantization_method_ = other.quantization_method_; + op_set_ = other.op_set_; + } + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in the textual format (on + // the commandline for example). + return "tf-quant-insert-quantized-functions"; + } + + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Insert quantized functions into the module"; + } + + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + private: + void runOnOperation() override; + + // Returns the function library for the given quantization method and opset + // pair. 
+  llvm::StringRef GetFunctionLibrary(QuantMethod quantization_method,
+                                     OpSet op_set);
+
+  Option<QuantMethod> quantization_method_{
+      *this, "quantization-method",
+      llvm::cl::init(tensorflow::quantization::QuantizationMethod::
+                         METHOD_STATIC_RANGE_INT8),
+      llvm::cl::desc("Choose quantization method."),
+      llvm::cl::values(
+          clEnumValN(tensorflow::quantization::QuantizationMethod::
+                         METHOD_STATIC_RANGE_INT8,
+                     "ptq", "Post-training static-range quantization"),
+          clEnumValN(tensorflow::quantization::QuantizationMethod::
+                         METHOD_DYNAMIC_RANGE_INT8,
+                     "drq", "Post-training dynamic-range quantization"),
+          clEnumValN(tensorflow::quantization::QuantizationMethod::
+                         METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8,
+                     "weight_only", "Post-training weight-only quantization"))};
+
+  Option<OpSet> op_set_{
+      *this, "target-opset", llvm::cl::init(OpSet::TF),
+      llvm::cl::desc("Choose target opset."),
+      llvm::cl::values(
+          clEnumValN(OpSet::TF, "TF",
+                     "Uses TF ops that mimic quantization behavior"),
+          clEnumValN(OpSet::XLA, "XLA", "Uses TF XLA ops"),
+          clEnumValN(OpSet::UNIFORM_QUANTIZED, "UNIFORM_QUANTIZED",
+                     "Uses TF Uniform Quantized ops"))};
+};
+
+llvm::StringRef InsertQuantizedFunctionsPass::GetFunctionLibrary(
+    QuantMethod quantization_method, OpSet op_set) {
+  absl::flat_hash_map<OpSet, llvm::StringRef> function_library_map;
+  if (quantization_method ==
+      tensorflow::quantization::QuantizationMethod::METHOD_DYNAMIC_RANGE_INT8) {
+    function_library_map = {
+        {OpSet::TF, quant::kQuantizedFunctionLibraryInMLIR_TF_DRQ},
+        {OpSet::UNIFORM_QUANTIZED,
+         quant::kQuantizedFunctionLibraryInMLIR_UNIFORM_QUANTIZED_DRQ},
+        {OpSet::XLA, quant::kQuantizedFunctionLibraryInMLIR_TF_DRQ}};
+  } else if (quantization_method ==
+             tensorflow::quantization::QuantizationMethod::
+                 METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8) {
+    // The uniform quantized opset is not supported for weight-only since the
+    // inputs for weight quantization are floats, and only dequantize_i8 is
+    // used from the quantized function library.
+    function_library_map = {
+        {OpSet::TF, quant::kQuantizedFunctionLibraryInMLIR},
+        {OpSet::XLA, quant::kQuantizedFunctionLibraryInMLIR_XLA_WEIGHT_ONLY}};
+  } else {
+    function_library_map = {
+        {OpSet::TF, quant::kQuantizedFunctionLibraryInMLIR},
+        {OpSet::UNIFORM_QUANTIZED,
+         quant::kQuantizedFunctionLibraryInMLIR_UNIFORM_QUANTIZED},
+        {OpSet::XLA, quant::kQuantizedFunctionLibraryInMLIR}};
+  }
+
+  auto it = function_library_map.find(op_set);
+  if (it != function_library_map.end()) {
+    return it->second;
+  }
+  return llvm::StringRef();
+}
+
+static PassRegistration<InsertQuantizedFunctionsPass> pass;
+
+void InsertQuantizedFunctionsPass::runOnOperation() {
+  ModuleOp module = getOperation();
+  SymbolTable symbol_table(module);
+
+  std::unique_ptr<llvm::MemoryBuffer> mem_buffer;
+  llvm::StringRef quantized_function_library =
+      GetFunctionLibrary(quantization_method_, op_set_);
+
+  if (quantized_function_library.empty()) {
+    emitError(module.getLoc())
+        << "Failed to get function library for the opset.";
+    signalPassFailure();
+    return;
+  }
+
+  mem_buffer =
+      llvm::MemoryBuffer::getMemBuffer(quantized_function_library,
+                                       /*BufferName=*/"",
+                                       /*RequiresNullTerminator=*/false);
+
+  llvm::SourceMgr source_mgr;
+  source_mgr.AddNewSourceBuffer(std::move(mem_buffer), llvm::SMLoc());
+  OwningOpRef<ModuleOp> module_ref =
+      parseSourceFile<ModuleOp>(source_mgr, module.getContext());
+  // Inline and optimize loaded functions.
+ MLIRContext* context = &getContext(); + PassManager pm(context); + pm.addPass(createInlinerPass()); + pm.addNestedPass(createCanonicalizerPass()); + pm.addNestedPass(createCSEPass()); + + StatusScopedDiagnosticHandler diagnostic_handler(context); + if (failed(pm.run(*module_ref))) { + emitError(module.getLoc()) << "failed to apply the optimization: " + << diagnostic_handler.ConsumeStatus().message(); + signalPassFailure(); + return; + } + + // Copy all functions used by this signature to the final MLIR module. + for (func::FuncOp func : module_ref->getOps()) { + // Do nothing if the function already exists. + if (symbol_table.lookup(func.getSymName()) != nullptr) continue; + + // Set the function to private and insert to the module. + func::FuncOp new_func = func.clone(); + new_func.setPrivate(); + symbol_table.insert(new_func); + + // For consistency, we require all quantized composite function to have + // the "tf_quant.quantized_ops" attribute. + if (!new_func.getSymName().starts_with("quantized_")) continue; + if (!new_func->hasAttrOfType("tf_quant.quantized_ops")) { + new_func->emitError() << "Missing \"tf_quant.quantized_ops\" " + "attribute in the quantized composite function."; + signalPassFailure(); + } + } +} + +} // namespace + +// Creates an instance of the pass for inserting quantized functions. +std::unique_ptr> CreateInsertQuantizedFunctionsPass( + QuantMethod quantization_method, OpSet target_opset) { + return std::make_unique(quantization_method, + target_opset); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_restore_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_restore_op.cc new file mode 100644 index 000000000000..d9594d05a9d7 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_restore_op.cc @@ -0,0 +1,226 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include +#include + +#include "absl/log/log.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/constants.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using ::mlir::tf_saved_model::GetInitializerFunction; +using ::mlir::tf_saved_model::kTfSavedModelIndexPathAttr; +using ::mlir::tf_saved_model::kTfSavedModelInitializerRestoreType; + +// This pass creates a RestoreV2 op in the initializer function with +// type "restore_op" that initializes variables from checkpoint. It finds +// tf.AssignVariableOp(tf.VarHandleOp, tf.Const) patterns in the initializer +// function and replaces tf.Consts with the results of RestoreV2. +class InsertRestoreOpPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InsertRestoreOpPass) + + explicit InsertRestoreOpPass() = default; + + // The argument used to refer to the pass in the textual format (e.g. on the + // commandline). + StringRef getArgument() const final { return "tf-quant-insert-restore-op"; } + + StringRef getDescription() const final { + return "Creates RestoreV2 op to initialize the variables in the " + "initializer function (`tf_saved_model.initializer_type == " + "'restore_op'`). Replaces each occurrence of " + "`tf.AssignVariableOp(tf.VarHandleOp, tf.Const)` patterns with " + "`tf.AssignVariableOp(tf.VarHandleOp, restore_op_output#N)`, where " + "`restore_op_output#N` is the Nth output of the newly created " + "RestoreV2Op."; + } + + void runOnOperation() override; +}; + +// Finds `tf.AssignVariableOp(tf.VarHandleOp, tf.Const)` patterns and returns +// the `tf.VarHandleOp`s that are initialized by these `tf.AssignVariableOp`s. +std::vector CollectVariableOps( + func::FuncOp session_init_func) { + std::vector var_handle_ops{}; + + for (auto assign_variable_op : llvm::make_early_inc_range( + session_init_func.getOps())) { + Value resource_operand = assign_variable_op.getOperand(0); + Value assigned_value_operand = assign_variable_op.getOperand(1); + + if (auto var_handle_op = + dyn_cast(resource_operand.getDefiningOp()); + var_handle_op && + isa(assigned_value_operand.getDefiningOp())) { + var_handle_ops.emplace_back(var_handle_op); + } + } + + return var_handle_ops; +} + +// Creates a `ConstOp` of 1-dimensional TF::StringType out of `str_values`. 
+TF::ConstOp Create1DStringConst(const ArrayRef<std::string> str_values,
+                                const Location loc, OpBuilder& builder) {
+  const auto tensor_type =
+      RankedTensorType::get(/*shape=*/{static_cast<int64_t>(str_values.size())},
+                            /*elementType=*/builder.getType<TF::StringType>());
+
+  return builder.create<TF::ConstOp>(
+      loc, DenseStringElementsAttr::get(
+               tensor_type,
+               SmallVector<StringRef>(str_values.begin(), str_values.end())));
+}
+
+// Creates a new argument for `func_op` that accepts a string tensor containing
+// the checkpoint file's prefix.
+BlockArgument InsertFilePrefixArgument(func::FuncOp func_op,
+                                       OpBuilder& builder) {
+  const auto filename_op_type = RankedTensorType::get(
+      /*shape=*/{}, /*elementType=*/builder.getType<TF::StringType>());
+  const auto file_prefix_attr = builder.getStringAttr(quant::kTfFilePrefix);
+  const auto arg_attrs = builder.getDictionaryAttr({builder.getNamedAttr(
+      kTfSavedModelIndexPathAttr, builder.getArrayAttr({file_prefix_attr}))});
+
+  const int insert_idx = func_op.getNumArguments();
+
+  (void)func_op.insertArgument(insert_idx, /*argType=*/filename_op_type,
+                               arg_attrs, NameLoc::get(file_prefix_attr));
+
+  return func_op.getArgument(insert_idx);
+}
+
+// Creates a 1D string array constant for "tensor_names" input of `RestoreV2`
+// op. The `ConstOp` will be created at `builder`'s current insertion point.
+TF::ConstOp CreateTensorNamesConst(const ArrayRef<std::string> tensor_names,
+                                   OpBuilder& builder) {
+  const auto loc = NameLoc::get(builder.getStringAttr("tensor_names"));
+  return Create1DStringConst(tensor_names, loc, builder);
+}
+
+// Creates a 1D string array constant for "shape_and_slices" input of
+// `RestoreV2` op. The `ConstOp` will be created at `builder`'s current
+// insertion point. It will be filled with `size` empty strings.
+TF::ConstOp CreateShapeAndSlicesConst(const int size, OpBuilder& builder) {
+  const SmallVector<std::string> shape_and_slices_values(size, /*Value=*/"");
+
+  const auto loc = NameLoc::get(builder.getStringAttr("shape_and_slices"));
+  return Create1DStringConst(shape_and_slices_values, loc, builder);
+}
+
+// Creates a `tf.RestoreV2Op` that loads the variable values from the checkpoint
+// file. The loaded tensors will be used to initialize `tf.VarHandleOp`s via
+// `tf.AssignVariableOp`s.
+void CreateRestoreV2Op(std::vector<TF::VarHandleOp>& target_var_handle_ops,
+                       func::FuncOp session_init_func) {
+  SmallVector<Type> tensor_types{};
+  SmallVector<std::string> tensor_names{};
+  for (auto var_handle_op : target_var_handle_ops) {
+    tensor_names.emplace_back(var_handle_op.getSharedName().str());
+    // Location must be set to the same name as the shared name. The Location is
+    // later translated to the op's name when exported to `GraphDef`. This is
+    // required to find the correct variable name to restore when it is
+    // imported back to MLIR. When importing the graph to MLIR, the name of the
+    // op is used to retrieve the tensor values of each variable. See
+    // `InitializeVariablesInSessionInitializer` for further details.
+    const auto loc = NameLoc::get(StringAttr::get(
+        var_handle_op.getContext(), var_handle_op.getSharedName()));
+    var_handle_op->setLoc(loc);
+
+    // Ex) If VarHandleOp's type is tensor<!tf_type.resource<tensor<1xf32>>>,
+    // then tensor<1xf32> is the subtype.
+ tensor_types.emplace_back(var_handle_op.resource_subtype()); + } + + auto builder = + OpBuilder::atBlockTerminator(&session_init_func.getBody().front()); + + const BlockArgument filename_arg = + InsertFilePrefixArgument(session_init_func, builder); + + TF::ConstOp tensor_names_const = + CreateTensorNamesConst(tensor_names, builder); + TF::ConstOp shape_and_slices_const = + CreateShapeAndSlicesConst(tensor_names.size(), builder); + + auto restore_op = builder.create( + session_init_func.getLoc(), + /*tensors=*/tensor_types, + /*prefix=*/filename_arg, tensor_names_const, shape_and_slices_const); + + for (auto [idx, restore_result] : llvm::enumerate(restore_op.getResults())) { + builder.create( + restore_op.getLoc(), target_var_handle_ops[idx], restore_result); + } +} + +// TODO(b/261813194): Do not create a new RestoreV2 op when a RestoreV2 op +// already exists. +void InsertRestoreOpPass::runOnOperation() { + ModuleOp module_op = getOperation(); + + func::FuncOp session_init_func = GetInitializerFunction( + module_op, /*initializer_type=*/kTfSavedModelInitializerRestoreType); + if (!session_init_func) { + LOG(INFO) << "No session initializer function with type 'restore_op'. " + "RestoreV2 op will not be created."; + return; + } + + std::vector target_var_handle_ops = + CollectVariableOps(session_init_func); + if (target_var_handle_ops.empty()) { + LOG(INFO) << "There are no VarHandleOps to restore. RestoreV2 op will not " + "be created."; + return; + } + + CreateRestoreV2Op(target_var_handle_ops, session_init_func); +} + +static PassRegistration pass{}; + +} // namespace + +std::unique_ptr> CreateInsertRestoreOpPass() { + return std::make_unique(); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_save_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_save_op.cc new file mode 100644 index 000000000000..2a8d65176118 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_insert_save_op.cc @@ -0,0 +1,254 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include + +#include "absl/log/log.h" +#include "llvm/ADT/STLExtras.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/constants.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using ::mlir::tf_saved_model::GetInitializerFunction; +using ::mlir::tf_saved_model::kTfSavedModelInitializerRestoreType; + +constexpr StringRef kTfQuantSaveV2OpName = "tf_quant__save_save_v2"; +constexpr StringRef kTfQuantSaveReturnOpName = "tf_quant__save_return"; + +// A pass that creates a new function that wraps the newly created SaveV2 op. +// The new function's name is "tf_quant__save". The function accepts a single +// string tensor as argument, which specifies the path to the checkpoint to +// which the variable's tensor values are saved. It finds +// `tf.AssignVariableOp(tf.VarHandleOp, tf.Const)` pattern in the initializer +// function of type "restore_op" to identify the VarHandleOps that should be +// saved using the SaveV2 op. +class InsertSaveOpPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InsertSaveOpPass) + + explicit InsertSaveOpPass() = default; + + // The argument used to refer to the pass in the textual format (e.g. on the + // commandline). + StringRef getArgument() const final { return "tf-quant-insert-save-op"; } + + StringRef getDescription() const final { + return "Inserts a new function that wraps a SaveV2 op. The SaveV2 op saves " + "the values of the VarHandleOps that are found in the initializer " + "function of 'restore_op' type."; + } + + void runOnOperation() override; +}; + +// Finds `tf.AssignVariableOp(tf.VarHandleOp, tf.Const)` patterns and removes +// `tf.AssignVariableOp`s and `tf.Const`s. Collects and returns the +// `tf.VarHandleOp`s that are initialized by these `tf.AssignVariableOp`s. 
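+// Rough sketch of the pattern matched below (hypothetical names and shapes):
+//   %w = "tf.VarHandleOp"() {shared_name = "w"}
+//       : () -> tensor<!tf_type.resource<tensor<2xf32>>>
+//   %cst = "tf.Const"() {value = dense<0.0> : tensor<2xf32>} : () -> tensor<2xf32>
+//   "tf.AssignVariableOp"(%w, %cst)
+//       : (tensor<!tf_type.resource<tensor<2xf32>>>, tensor<2xf32>) -> ()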
+SmallVector CollectVariableOps( + func::FuncOp session_init_func) { + SmallVector var_handle_ops{}; + + for (auto assign_variable_op : llvm::make_early_inc_range( + session_init_func.getOps())) { + Value resource_operand = assign_variable_op.getOperand(0); + auto var_handle_op = + dyn_cast(resource_operand.getDefiningOp()); + if (!var_handle_op) continue; + + Value assigned_value_operand = assign_variable_op.getOperand(1); + auto const_op = + dyn_cast(assigned_value_operand.getDefiningOp()); + if (!const_op) continue; + + var_handle_ops.emplace_back(var_handle_op); + } + + return var_handle_ops; +} + +// Creates a `ConstOp` of 1-dimensional TF::StringType out of `str_values`. +TF::ConstOp Create1DStringConst(const ArrayRef str_values, + const Location loc, OpBuilder& builder) { + const auto tensor_type = + RankedTensorType::get(/*shape=*/{static_cast(str_values.size())}, + /*elementType=*/builder.getType()); + + return builder.create( + loc, DenseStringElementsAttr::get( + tensor_type, + SmallVector(str_values.begin(), str_values.end()))); +} + +// Creates a 1D string array constant for "tensor_names" input of `RestoreV2` +// op. The `ConstOp` will be created at `builder`'s current insertion point. +TF::ConstOp CreateTensorNamesConst(const ArrayRef tensor_names, + OpBuilder& builder) { + const auto loc = NameLoc::get(builder.getStringAttr("tensor_names")); + return Create1DStringConst(tensor_names, loc, builder); +} + +// Creates a 1D string array constant for "shape_and_slices" input of +// `RestoreV2` op. The `ConstOp` will be created at `builder`'s current +// insertion point. It will be filled with `size` empty strings. +TF::ConstOp CreateShapeAndSlicesConst(const int size, OpBuilder& builder) { + const SmallVector shape_and_slices_values(size, /*Value=*/""); + + const auto loc = NameLoc::get(builder.getStringAttr("shape_and_slices")); + return Create1DStringConst(shape_and_slices_values, loc, builder); +} + +// Returns cloned `VarHandleOp`s. Assumes `save_func`'s body is empty. +SmallVector CloneVarHandleOpsIntoSaveFunc( + func::FuncOp save_func, const ArrayRef var_handle_ops) { + Block& save_op_block = save_func.getBody().front(); + + IRMapping mapper{}; + SmallVector cloned_var_handle_ops = {}; + for (auto var_handle_op : var_handle_ops) { + Operation* cloned_var_handle_op = var_handle_op->clone(mapper); + save_op_block.push_back(cloned_var_handle_op); + + cloned_var_handle_ops.push_back( + cast(cloned_var_handle_op)); + } + + return cloned_var_handle_ops; +} + +// Creates and returns a `TF::SaveV2Op` for the `var_handle_ops`. For each +// VarHandleOp in `var_handle_ops` the tensor value is read via +// `TF::ReadVariableOp` and provided as arguments to the newly created SaveV2 +// op. 
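+// Rough sketch of the ops emitted below for a single variable "w"
+// (hypothetical names; the real tensor list comes from `var_handle_ops`):
+//   %val = "tf.ReadVariableOp"(%w)
+//       : (tensor<!tf_type.resource<tensor<2xf32>>>) -> tensor<2xf32>
+//   "tf.SaveV2"(%file_prefix, %tensor_names, %shape_and_slices, %val)
+//       : (tensor<!tf_type.string>, tensor<1x!tf_type.string>,
+//          tensor<1x!tf_type.string>, tensor<2xf32>) -> ()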
+TF::SaveV2Op CreateSaveV2Op(func::FuncOp save_func, + const ArrayRef var_handle_ops) { + auto builder = OpBuilder::atBlockEnd(&save_func.getBody().front()); + + SmallVector tensor_names = {}; + SmallVector tensor_values = {}; + for (auto var_handle_op : var_handle_ops) { + tensor_names.emplace_back(var_handle_op.getSharedName().str()); + + auto read_var_op = builder.create( + var_handle_op.getLoc(), var_handle_op.resource_subtype(), + var_handle_op); + tensor_values.emplace_back(read_var_op.getResult()); + } + + TF::ConstOp tensor_names_const = + CreateTensorNamesConst(tensor_names, builder); + TF::ConstOp shape_and_slices_const = + CreateShapeAndSlicesConst(tensor_names.size(), builder); + + BlockArgument filename_arg = save_func.getArgument(0); + return builder.create( + NameLoc::get(builder.getStringAttr(kTfQuantSaveV2OpName)), + /*prefix=*/filename_arg, tensor_names_const, shape_and_slices_const, + /*tensors=*/tensor_values); +} + +// Creates and returns a new `FuncOp` named "tf_quant__save". The resulting +// `FuncOp`'s body has no ops. +func::FuncOp CreateEmptySaveFunc(ModuleOp module_op) { + OpBuilder builder(module_op); + builder.setInsertionPointToEnd(&module_op.getBodyRegion().front()); + + auto filename_input_type = RankedTensorType::get( + /*shape=*/{}, /*elementType=*/builder.getType()); + + FunctionType func_type = builder.getFunctionType( + /*inputs=*/{filename_input_type}, /*results=*/{}); + auto save_func = builder.create( + NameLoc::get(builder.getStringAttr(quant::kTfQuantSaveFuncName)), + /*sym_name=*/quant::kTfQuantSaveFuncName, func_type); + save_func.addEntryBlock(); + save_func.setPrivate(); + + return save_func; +} + +// Creates a save function that contains the `TF::SaveV2Op` for the variables in +// `var_handle_ops`. The `var_handle_ops` are cloned into the new function and +// provides the tensor values to be saved. The new function is a private +// function and has one argument for the file prefix (the directory to the +// checkpoint). +void CreateSaveFunc(ModuleOp module_op, + const ArrayRef var_handle_ops) { + func::FuncOp save_func = CreateEmptySaveFunc(module_op); + + const SmallVector cloned_var_handle_ops = + CloneVarHandleOpsIntoSaveFunc(save_func, var_handle_ops); + + CreateSaveV2Op(save_func, cloned_var_handle_ops); + + // Create a "func.return". + auto builder = OpBuilder::atBlockEnd(&save_func.getBody().front()); + builder.create( + NameLoc::get(builder.getStringAttr(kTfQuantSaveReturnOpName))); +} + +void InsertSaveOpPass::runOnOperation() { + ModuleOp module_op = getOperation(); + + func::FuncOp session_init_func = GetInitializerFunction( + module_op, /*initializer_type=*/kTfSavedModelInitializerRestoreType); + if (!session_init_func) { + LOG(INFO) << "No session initializer function with type 'restore_op'. " + "SaveV2 op will not be created."; + return; + } + + SmallVector target_var_handle_ops = + CollectVariableOps(session_init_func); + if (target_var_handle_ops.empty()) { + LOG(INFO) << "There are no VarHandleOps to save. 
SaveV2 op will not " + "be created."; + return; + } + + CreateSaveFunc(module_op, target_var_handle_ops); +} + +static PassRegistration pass{}; + +} // namespace + +std::unique_ptr> CreateInsertSaveOpPass() { + return std::make_unique(); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_hashtable_ops_as_args.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_hashtable_ops_as_args.cc new file mode 100644 index 000000000000..638c4071feeb --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_hashtable_ops_as_args.cc @@ -0,0 +1,225 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include + +#include "absl/strings/str_cat.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/constants.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" + +namespace mlir { +namespace tf_quant { +namespace { + +constexpr StringRef kSharedNameAttr = "shared_name"; + +class LiftHashTableOpsAsArgsPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LiftHashTableOpsAsArgsPass) + explicit LiftHashTableOpsAsArgsPass() = default; + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-lift-hashtable-ops-as-args"; + } + StringRef getDescription() const final { + return "Lifts HashTable ops as function arguments."; + } + + void runOnOperation() override; +}; + +// Checks if the given op is a Hashtable op. +bool IsHashTableOp(Operation* op) { + return llvm::isa(op); +} + +// Checks if the function is the main or initializer function. 
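+// In practice this covers the exported entry function
+// (tensorflow::kImportModelDefaultGraphFuncName, typically "main"), the
+// generated save function ("tf_quant__save"), and any tf_saved_model
+// initializer function.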
+bool IsMainOrInitializerFunction(ModuleOp module, func::FuncOp func) { + if (func.getSymName() == + llvm::StringRef(tensorflow::kImportModelDefaultGraphFuncName) || + func.getSymName() == quant::kTfQuantSaveFuncName) { + return true; + } + + for (func::FuncOp init_func : + tf_saved_model::GetInitializerFunctions(module)) { + if (func.getSymName() == init_func.getSymName()) { + return true; + } + } + return false; +} + +// Checks if the function is only used by supported ops. Returns false when the +// function has no uses. Currently, only PartitionedCall is supported. +// TODO(b/284222309): Support lifting for functions called by control flow. +bool UsedBySupportedOps(ModuleOp module, func::FuncOp func) { + auto function_uses = + SymbolTable::getSymbolUses(func, &module.getBodyRegion()); + if (!function_uses.has_value()) return false; + for (auto& function_use : function_uses.value()) { + if (!llvm::isa( + function_use.getUser())) { + return false; + } + } + return true; +} + +// Returns the `shared_name` attribute value if exists. If not, returns an +// empty string. +StringRef GetSharedName(Operation* op) { + if (!op->hasAttrOfType(kSharedNameAttr)) return ""; + return op->getAttrOfType(kSharedNameAttr).getValue(); +} + +// Checks if the HashTable is initialized. This function assumes that the +// HashTable is initialized if it appears in the initializer since it can't +// check the actual value. +bool IsResourceInitialized(ModuleOp module_op, Operation* hash_table) { + StringRef shared_name = GetSharedName(hash_table); + if (shared_name.empty()) return false; + + for (func::FuncOp init_func_op : + tf_saved_model::GetInitializerFunctions(module_op)) { + for (Operation& op : init_func_op.getBody().getOps()) { + StringRef other_shared_name = GetSharedName(&op); + if (IsHashTableOp(&op) && other_shared_name == shared_name) { + return true; + } + } + } + return false; +} + +// Lifts HashTable ops in the target function as function arguments and returns +// the lifted ops. These ops will then be added to the caller function and +// passed to the target function. +LogicalResult LiftHashTableOpsToArguments(ModuleOp module_op, + func::FuncOp target_func) { + if (!llvm::hasSingleElement(target_func)) return success(); + if (!UsedBySupportedOps(module_op, target_func)) return success(); + if (IsMainOrInitializerFunction(module_op, target_func)) return success(); + + llvm::StringMap shared_name_to_arg_idx; + llvm::SmallVector> lifted_op_and_arg_idx; + Block& block = target_func.front(); + auto func_type = target_func.getFunctionType(); + + for (Operation& op : block.without_terminator()) { + StringRef shared_name = GetSharedName(&op); + if (shared_name.empty() || !IsHashTableOp(&op)) continue; + if (!IsResourceInitialized(module_op, &op)) continue; + + auto it = + shared_name_to_arg_idx.insert({shared_name, block.getNumArguments()}); + if (it.second) { + auto resource_type = op.getResult(0).getType(); + op.getResult(0).replaceAllUsesWith( + block.addArgument(resource_type, op.getLoc())); + quant::AddEntryFunctionInput( + absl::StrCat("hash_table_", it.first->getValue(), ":0"), target_func); + // Avoid deleting the op here, clone it to the caller function first. + lifted_op_and_arg_idx.emplace_back(&op, it.first->getValue()); + } else { + op.getResult(0).replaceAllUsesWith( + block.getArgument(it.first->getValue())); + op.erase(); + } + } + if (lifted_op_and_arg_idx.empty()) return success(); + + // Update the function signature as well as its uses. 
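+  // Rough sketch of the rewrite performed below (hypothetical names): a call
+  // such as
+  //   "tf.PartitionedCall"(%input) {f = @lookup_fn} : (tensor<i64>) -> tensor<f32>
+  // becomes
+  //   %h = "tf.HashTableV2"() {shared_name = "table"} : () -> tensor<!tf_type.resource>
+  //   "tf.PartitionedCall"(%input, %h) {f = @lookup_fn}
+  //       : (tensor<i64>, tensor<!tf_type.resource>) -> tensor<f32>
+  // while @lookup_fn itself gains a matching resource argument.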
+ target_func.setType(FunctionType::get(target_func.getContext(), + block.getArgumentTypes(), + func_type.getResults())); + + IRMapping mapping; + OpBuilder builder(module_op); + OpBuilder::InsertionGuard g(builder); + // The function has been checked to have at least one use. + auto function_uses = + SymbolTable::getSymbolUses(target_func, &module_op.getBodyRegion()); + for (auto& function_use : function_uses.value()) { + auto call_op = function_use.getUser(); + auto caller_func = call_op->getParentOfType(); + if (!caller_func) return failure(); + + builder.setInsertionPoint(call_op); + for (auto [lifted_op, arg_idx] : lifted_op_and_arg_idx) { + auto new_op = builder.clone(*lifted_op, mapping); + call_op->insertOperands(arg_idx, new_op->getResult(0)); + } + + // Try to lift recursively until the main function. + if (failed(LiftHashTableOpsToArguments(module_op, caller_func))) { + return failure(); + } + } + + // Erase the lifted operations explicitly. + for (auto [lifted_op, arg_idx] : lifted_op_and_arg_idx) { + lifted_op->erase(); + } + + return success(); +} + +void LiftHashTableOpsAsArgsPass::runOnOperation() { + auto module_op = getOperation(); + + for (auto func_op : module_op.getOps()) { + if (failed(LiftHashTableOpsToArguments(module_op, func_op))) { + signalPassFailure(); + return; + } + } +} + +static PassRegistration pass; + +} // namespace + +std::unique_ptr> CreateLiftHashTableOpsAsArgsPass() { + return std::make_unique(); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_quantizable_spots_as_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_quantizable_spots_as_functions.cc new file mode 100644 index 000000000000..1d073aa7c083 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_quantizable_spots_as_functions.cc @@ -0,0 +1,419 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include +#include +#include + +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Rewrite/FrozenRewritePatternSet.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "re2/re2.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using QuantizationUnit = + ::tensorflow::quantization::UnitWiseQuantizationSpec::QuantizationUnit; +using ::tensorflow::quantization::OpSet; +using ::tensorflow::quantization::QuantizationComponentSpec; +using ::tensorflow::quantization::QuantizationMethod; +using ::tensorflow::quantization::QuantizationOptions; +using ::tensorflow::quantization::UnitWiseQuantizationSpec; + +class LiftQuantizableSpotsAsFunctionsPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + LiftQuantizableSpotsAsFunctionsPass) + + LiftQuantizableSpotsAsFunctionsPass() : test_mode_(true) { + initializeForTest(); + } + + explicit LiftQuantizableSpotsAsFunctionsPass( + const QuantizationOptions& quant_options) + : quant_options_(quant_options), test_mode_(false) {} + + LiftQuantizableSpotsAsFunctionsPass( + const LiftQuantizableSpotsAsFunctionsPass& other) { + quant_options_ = other.quant_options_; + test_mode_ = other.test_mode_; + op_set_ = other.op_set_; + initializeForTest(); + } + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-lift-quantizable-spots-as-functions"; + } + + StringRef getDescription() const final { + // This is a brief description of the pass. 
+ return "Replace quantization candidates with composite functions into the " + "module"; + } + + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + void runOnOperation() override; + + private: + QuantizationOptions quant_options_; + bool test_mode_; + Option op_set_{ + *this, "target-opset", llvm::cl::init(OpSet::TF), + llvm::cl::desc("Choose target opset."), + llvm::cl::values( + clEnumValN(OpSet::TF, "TF", + "Uses TF ops that mimic quantization behavior"), + clEnumValN(OpSet::XLA, "XLA", "Uses TF XLA ops"), + clEnumValN(OpSet::UNIFORM_QUANTIZED, "UNIFORM_QUANTIZED", + "Uses TF Uniform Quantized ops"))}; + + // Initialize for tests. + void initializeForTest() { + if (!test_mode_) return; + + op_set_.setCallback([this](const OpSet& new_op_set) { + quant_options_.set_op_set(new_op_set); + }); + + // Set the test quantization method to static-range. + if (quant_options_.quantization_method().preset_method() == + QuantizationMethod::METHOD_UNSPECIFIED) { + quant_options_.mutable_quantization_method()->set_preset_method( + QuantizationMethod::METHOD_STATIC_RANGE_INT8); + } + + if (quant_options_.quantization_method() + .quantization_component_specs() + .empty()) { + auto add_new_spec = + [this](QuantizationComponentSpec::QuantizationComponent component, + QuantizationComponentSpec::TensorType type) { + QuantizationComponentSpec* new_spec = + quant_options_.mutable_quantization_method() + ->add_quantization_component_specs(); + new_spec->set_quantization_component(component); + new_spec->set_tensor_type(type); + }; + + add_new_spec(QuantizationComponentSpec::COMPONENT_ACTIVATION, + QuantizationComponentSpec::TENSORTYPE_INT_8); + add_new_spec(QuantizationComponentSpec::COMPONENT_WEIGHT, + QuantizationComponentSpec::TENSORTYPE_INT_8); + add_new_spec(QuantizationComponentSpec::COMPONENT_BIAS, + QuantizationComponentSpec::TENSORTYPE_INT_32); + } + + if (quant_options_.unit_wise_quantization_specs().empty()) { + // Opt-out a node named `test_opt_out`. + UnitWiseQuantizationSpec* new_spec = + quant_options_.add_unit_wise_quantization_specs(); + QuantizationUnit* new_unit = new_spec->add_unit(); + new_unit->set_node_name("test_opt_out"); + new_spec->mutable_quantization_method()->set_preset_method( + QuantizationMethod::METHOD_NO_QUANTIZE); + } + } +}; + +class CheckQuantizableOps + : public mlir::OpRewritePattern { + public: + explicit CheckQuantizableOps(MLIRContext* context, + const QuantizationOptions& quant_options) + : OpRewritePattern(context), + quant_options_(quant_options) {} + + private: + LogicalResult matchAndRewrite(TF::PartitionedCallOp call_op, + PatternRewriter& rewriter) const override { + StringRef function_name = + mlir::cast(call_op.getFAttr()).getValue(); + if (!function_name.starts_with("composite_") || + !call_op->hasAttr(kQuantTraitAttrName)) { + return failure(); + } + + absl::Status check_status; + // TODO(b/270906404): Support weight-only gather for uniform quantized opset + // in PTQ mode + if (quant_options_.op_set() == OpSet::UNIFORM_QUANTIZED && + function_name.contains("gather")) { + check_status.Update(absl::InternalError("Weight-only op is skipped.")); + } + + if (quant_options_.op_set() == OpSet::XLA) { + check_status.Update(checkQuantizableOpsForXla(call_op, function_name)); + } + + // Only the composite functions with f32 inputs are quantizable. 
+ if (call_op.getResults().size() == 1 && + !mlir::cast(call_op->getResult(0).getType()) + .getElementType() + .isF32()) { + check_status.Update(absl::InternalError( + "Composite functions for quantization should be f32 type.")); + } + + // The OK status means this op is quantizable. Return failure since the + // pattern doesn't rewrite anything yet. + if (check_status.ok()) return failure(); + call_op->removeAttr(kQuantTraitAttrName); + removeAttrMapAttribute(call_op, function_name, check_status.message()); + return success(); + } + + // Get the quantization method to apply to this composite function. If set, + // the unit-wise quantization method overrides the default one. + std::optional getUnitWiseQuantizationMethod( + TF::PartitionedCallOp call_op) const { + // If unit-wise quantization config is found, overwrite the default config. + auto quantization_unit = + quant::FindQuantizationUnitFromLoc(call_op.getLoc()); + if (!quantization_unit.has_value()) return std::nullopt; + + for (const auto& unit_config : + quant_options_.unit_wise_quantization_specs()) { + for (const auto& unit : unit_config.unit()) { + if (!unit.op_type().empty() && + quantization_unit.value().op_type() != unit.op_type()) { + continue; + } + + if (!unit.node_name().empty()) { + const RE2 node_name_regex(unit.node_name()); + if (!RE2::FullMatch(quantization_unit.value().node_name(), + node_name_regex)) { + continue; + } + } + + if (!unit.func_name().empty()) { + const RE2 func_name_regex(unit.func_name()); + if (!RE2::FullMatch(quantization_unit.value().func_name(), + func_name_regex)) { + continue; + } + } + + // Overrides the default quantization method. + return unit_config.quantization_method(); + } + } + return std::nullopt; + } + + absl::Status checkQuantizableOpsForXla(TF::PartitionedCallOp call_op, + StringRef function_name) const { + // Disable quantization for the DepthwiseConv since it has no benefits in + // the XLA opset. + if (function_name.contains("depthwise_conv2d")) { + return absl::InternalError( + "DepthwiseConv2D doesn't get any benefit of quantization in XLA."); + } else if (function_name.contains("conv2d")) { + // For Conv2D, the channel dimension must be static to calculate the + // feature group count. + if (!HasStaticShapeAtDims(call_op->getOperand(0), /*dims=*/3)) { + return absl::InternalError( + "The channel dimension of Conv2D is required to be static."); + } + } else if (function_name.contains("conv3d")) { + // For Conv3D, the channel dimension must be static to calculate the + // feature group count. + if (!HasStaticShapeAtDims(call_op->getOperand(0), /*dims=*/4)) { + return absl::InternalError( + "The channel dimension of Conv3D is required to be static."); + } + } else if (function_name.contains("batch_matmul")) { + // For BatchMatMul, the input must be ranked to determine the batch + // dimensions. + ShapedType shaped_type = + mlir::dyn_cast(call_op->getOperand(0).getType()); + if (!shaped_type || !shaped_type.hasRank()) { + return absl::InternalError("The input of BatchMatMul must have rank."); + } + } else if (function_name.contains("gather")) { + // This op is guaranteed to be a constant as ODS checks IsConstTensor. + // Check if the number of elements meets the requirement. 
+ int64_t num_elements = + mlir::cast(call_op.getOperand(0).getType()) + .getNumElements(); + if (num_elements < quant_options_.min_num_elements_for_weights()) { + return absl::InternalError( + "The params of Gather have fewer number of elements than " + "the `min_num_elements_for_weights`."); + } + } + + // Disable quantization if the quantization method is NO_QUANTIZE. + QuantizationMethod quantization_method = + quant_options_.quantization_method(); + if (quantization_method.quantization_component_specs().empty()) { + return absl::InternalError( + "The quantization method has been set to METHOD_NO_QUANTIZE."); + } + + // The unit-wise quantization config should override the loser-grained + // quantization config, such as `enable_two_input_tensors`. + bool is_unitwise_quantization_enabled = false; + std::optional unit_wise_quantization_method = + getUnitWiseQuantizationMethod(call_op); + if (unit_wise_quantization_method.has_value()) { + if (unit_wise_quantization_method.value() + .quantization_component_specs() + .empty()) { + return absl::InternalError( + "The unit-wise quantization method has been set to " + "METHOD_NO_QUANTIZE."); + } + is_unitwise_quantization_enabled = true; + } + + std::unique_ptr spec = GetTFOpQuantSpec(call_op); + for (auto iter : spec->coeff_op_quant_dim) { + Operation* preceding_op = call_op.getOperand(iter.first).getDefiningOp(); + // The XLA opset only supports constant filter/weight at the moment. + bool is_weight_constant = + preceding_op && preceding_op->hasTrait(); + + // There might be q/dq ops after the filter/weight. + if (auto dq_op = + llvm::dyn_cast_or_null( + preceding_op)) { + if (auto q_op = llvm::dyn_cast_or_null( + dq_op.getArg().getDefiningOp())) { + Operation* q_op_input = q_op.getArg().getDefiningOp(); + is_weight_constant = + q_op_input && q_op_input->hasTrait(); + } + } + + if (!is_weight_constant) { + if (!function_name.contains("matmul") && + !function_name.contains("einsum")) { + return absl::InternalError( + "Non-constant weights are not supported at the moment," + " except matmul and einsum."); + } else if (!quant_options_.enable_two_input_tensors() && + !is_unitwise_quantization_enabled) { + return absl::InternalError( + "Quantization is disabled for this op due to the non-constant " + "weight. 
You can enable it by setting `enable_two_input_tensors` " + "to true or using unit-wise quantization config."); + } + } + } + + return absl::OkStatus(); + } + + void removeAttrMapAttribute(TF::PartitionedCallOp call_op, + StringRef function_name, + StringRef error_message) const { + ModuleOp module = call_op->getParentOfType(); + SymbolTable symbol_table(module); + mlir::func::FuncOp composite_func = + dyn_cast(symbol_table.lookup(function_name)); + if (!composite_func) return; + + composite_func.walk([&](Operation* op) { + if (op->hasAttr(kAttrMapAttribute)) { + op->removeAttr(kAttrMapAttribute); + + std::string log_message; + llvm::raw_string_ostream log_stream(log_message); + op->getLoc().print(log_stream); + log_stream << ": Quantization disabled on this op: "; + log_stream << error_message << "\n"; + log_stream << "See the current operation:\n"; + op->print(log_stream); + VLOG(2) << log_message; + } + }); + } + + const QuantizationOptions& quant_options_; +}; + +static PassRegistration pass; + +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_quantizable_spots_as_functions.inc" + +void LiftQuantizableSpotsAsFunctionsPass::runOnOperation() { + MLIRContext* ctx = &getContext(); + RewritePatternSet patterns(ctx); + ModuleOp module = getOperation(); + + populateWithGenerated(patterns); + patterns.add(ctx, quant_options_); + FrozenRewritePatternSet frozen_patterns(std::move(patterns)); + + // Iterate over the sorted list of functions to keep the order deterministic. + for (func::FuncOp func : GetSortedFunctions(module)) { + if (failed(applyPatternsGreedily(func, frozen_patterns))) { + func.emitError() + << "tf-quant-lift-quantizable-spots-as-functions failed."; + signalPassFailure(); + } + } +} + +} // namespace + +std::unique_ptr> +CreateLiftQuantizableSpotsAsFunctionsPass( + const QuantizationOptions& quant_options) { + return std::make_unique(quant_options); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_quantizable_spots_as_functions.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_quantizable_spots_as_functions.td new file mode 100644 index 000000000000..9e0f26d87936 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_quantizable_spots_as_functions.td @@ -0,0 +1,390 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +include "mlir/IR/OpBase.td" +include "mlir/Dialect/Func/IR/FuncOps.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" +include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td" + +//===----------------------------------------------------------------------===// +// Helper functions. 
+//===----------------------------------------------------------------------===// + +class IsFusedOpEndsWith : AttrConstraint< + CPred<"!llvm::cast($_self).empty() && " + "llvm::cast($_self)[llvm::cast($_self).size() - 1]." + "cast<::mlir::StringAttr>().str() == \"" # OpName # "\"">, + "Matching fused '" # OpName # "' op at the end">; + +//===----------------------------------------------------------------------===// +// Pattern rules for lifting ops as functions +//===----------------------------------------------------------------------===// + +def LiftConv : Pat< + (TF_Conv2DOp:$res $input, $filter, $strides, $use_cudnn_on_gpu, $padding, + $explicit_paddings, IsDataFormatNHWC:$data_format, $dilations), + (LiftAsTFPartitionedCall<"composite_conv2d_fn"> + (ArgumentList $input, $filter), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"use_cudnn_on_gpu"> $use_cudnn_on_gpu), + (NamedAttr<"padding"> $padding), + (NamedAttr<"explicit_paddings"> $explicit_paddings), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 1)>; + +def LiftDepthwiseConv : Pat< + (TF_DepthwiseConv2dNativeOp:$res $input, $filter, $strides, $padding, + $explicit_paddings, IsDataFormatNHWC:$data_format, $dilations), + (LiftAsTFPartitionedCall<"composite_depthwise_conv2d_fn"> + (ArgumentList $input, $filter), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"padding"> $padding), + (NamedAttr<"explicit_paddings"> $explicit_paddings), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 1)>; + +def LiftMatMul : Pat< + (TF_MatMulOp:$res $a, $b, $transpose_a, $transpose_b, $grad_a, $grad_b), + (LiftAsTFPartitionedCall<"composite_matmul_fn"> + (ArgumentList $a, $b), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"transpose_a"> $transpose_a), + (NamedAttr<"transpose_b"> $transpose_b))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 1)>; + +def LiftConv3D : Pat< + (TF_Conv3DOp:$res $input, $filter, $strides, $padding, + IsDataFormatNDHWC:$data_format, $dilations), + (LiftAsTFPartitionedCall<"composite_conv3d_fn"> + (ArgumentList $input, $filter), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"padding"> $padding), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 1)>; + +def LiftBatchMatMul : Pat< + (TF_BatchMatMulV2Op:$res $x, $y, $adj_x, $adj_y, $grad_x, $grad_y), + (LiftAsTFPartitionedCall<"composite_batch_matmul_fn"> + (ArgumentList $x, $y), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"adj_x"> $adj_x), + (NamedAttr<"adj_y"> $adj_y))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 1)>; + +def LiftEinsum : Pat< + (TF_EinsumOp:$res $input, $equation), + (LiftAsTFPartitionedCall<"composite_einsum_fn"> + (ArgumentList $input), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"equation"> $equation))), + [(IsNotInLiftedFunc $res), + (IsEinsumSupportedByXlaDotV2 $equation) + ], [], (addBenefit 1)>; + + +//===----------------------------------------------------------------------===// +// Pattern rules for lifting ops with bias as functions +//===----------------------------------------------------------------------===// + +def LiftDepthwiseConv2dNativeWithBias : Pat< + (TF_BiasAddOp:$res + (TF_DepthwiseConv2dNativeOp $input, $filter, $strides, $padding, + $explicit_paddings, IsDataFormatNHWC:$data_format, $dilations), + $bias, IsDataFormatNHWC:$bias_data_format), + 
(LiftAsTFPartitionedCall<"composite_depthwise_conv2d_with_bias_fn"> + (ArgumentList $input, $filter, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"padding"> $padding), + (NamedAttr<"explicit_paddings"> $explicit_paddings), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 5)>; + +def LiftConv2dWithBias : Pat< + (TF_BiasAddOp:$res + (TF_Conv2DOp $input, $filter, $strides, $use_cudnn_on_gpu, $padding, + $explicit_paddings, IsDataFormatNHWC:$data_format, $dilations), + $bias, IsDataFormatNHWC:$bias_data_format), + (LiftAsTFPartitionedCall<"composite_conv2d_with_bias_fn"> + (ArgumentList $input, $filter, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"use_cudnn_on_gpu"> $use_cudnn_on_gpu), + (NamedAttr<"padding"> $padding), + (NamedAttr<"explicit_paddings"> $explicit_paddings), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 5)>; + +def LiftMatmulWithBias : Pat< + (TF_BiasAddOp:$res + (TF_MatMulOp $a, $b, $transpose_a, $transpose_b, $grad_a, $grad_b), + $bias, IsDataFormatNHWC:$bias_data_format), + (LiftAsTFPartitionedCall<"composite_matmul_with_bias_fn"> + (ArgumentList $a, $b, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"transpose_a"> $transpose_a), + (NamedAttr<"transpose_b"> $transpose_b))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 5)>; + +// TODO(b/278493977): Create generic implementation of lifting any fused op +// with any reshaping op +def LiftMatmulWithReshapeAndBias : Pat< + (TF_BiasAddOp:$res + (TF_ReshapeOp:$out + (TF_MatMulOp $a, $b, $transpose_a, $transpose_b, $grad_a, $grad_b), + $shape), + $bias, IsDataFormatNHWC:$bias_data_format), + (LiftAsTFPartitionedCall<"composite_matmul_with_reshape_and_bias_fn"> + (ArgumentList $a, $b, $bias, $shape), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"transpose_a"> $transpose_a), + (NamedAttr<"transpose_b"> $transpose_b))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 5)>; + +def LiftConv3dWithBias : Pat< + (TF_BiasAddOp:$res + (TF_Conv3DOp $input, $filter, $strides, $padding, + IsDataFormatNDHWC:$data_format, $dilations), + $bias, $bias_data_format), + (LiftAsTFPartitionedCall<"composite_conv3d_with_bias_fn"> + (ArgumentList $input, $filter, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"padding"> $padding), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 5)>; + +def LiftBatchMatMulWithBias : Pat< + (TF_BiasAddOp:$res + (TF_BatchMatMulV2Op $x, $y, $adj_x, $adj_y, $grad_x, $grad_y), + $bias, IsDataFormatNHWC:$bias_data_format), + (LiftAsTFPartitionedCall<"composite_batch_matmul_with_bias_fn"> + (ArgumentList $x, $y, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"adj_x"> $adj_x), + (NamedAttr<"adj_y"> $adj_y))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 5)>; + +def LiftEinsumWithBias : Pat< + (TF_BiasAddOp:$res + (TF_EinsumOp $input, $equation), + $bias, IsDataFormatNHWC:$bias_data_format), + (LiftAsTFPartitionedCall<"composite_einsum_with_bias_fn"> + (AppendToVector (ArgumentList $input), $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"equation"> $equation))), + [(IsNotInLiftedFunc $res), + (IsEinsumSupportedByXlaDotV2 $equation)], + [], (addBenefit 5)>; + +//===----------------------------------------------------------------------===// +// Pattern rules for lifting ops with bias and activation as 
functions +//===----------------------------------------------------------------------===// + +multiclass LiftCompositeOpsWithActivation { + def LiftConvWith#ActivationOp : Pat< + (ActivationOp:$res + (TF_Conv2DOp $input, $filter, $strides, $use_cudnn_on_gpu, $padding, + $explicit_paddings, IsDataFormatNHWC:$data_format, $dilations)), + (LiftAsTFPartitionedCall<"composite_conv2d_with_"# ActivationName #"_fn"> + (ArgumentList $input, $filter), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"use_cudnn_on_gpu"> $use_cudnn_on_gpu), + (NamedAttr<"padding"> $padding), + (NamedAttr<"explicit_paddings"> $explicit_paddings), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 10)>; + + def LiftConv2dWithBiasAnd#LastFusedOp : Pat< + (ActivationOp:$res + (TF_BiasAddOp + (TF_Conv2DOp $input, $filter, $strides, $use_cudnn_on_gpu, $padding, + $explicit_paddings, IsDataFormatNHWC:$data_format, $dilations), + $bias, IsDataFormatNHWC:$bias_data_format)), + (LiftAsTFPartitionedCall<"composite_conv2d_with_bias_and_"# ActivationName #"_fn"> + (ArgumentList $input, $filter, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"use_cudnn_on_gpu"> $use_cudnn_on_gpu), + (NamedAttr<"padding"> $padding), + (NamedAttr<"explicit_paddings"> $explicit_paddings), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 10)>; + + def LiftDepthwiseConv2dNativeWith#ActivationOp : Pat< + (ActivationOp:$res + (TF_DepthwiseConv2dNativeOp $input, $filter, $strides, $padding, + $explicit_paddings, IsDataFormatNHWC:$data_format, $dilations)), + (LiftAsTFPartitionedCall<"composite_depthwise_conv2d_with_"# ActivationName #"_fn"> + (ArgumentList $input, $filter), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"padding"> $padding), + (NamedAttr<"explicit_paddings"> $explicit_paddings), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 10)>; + + def LiftDepthwiseConv2dNativeWithBiasAnd#LastFusedOp : Pat< + (ActivationOp:$res + (TF_BiasAddOp + (TF_DepthwiseConv2dNativeOp $input, $filter, $strides, $padding, + $explicit_paddings, IsDataFormatNHWC:$data_format, $dilations), + $bias, IsDataFormatNHWC:$bias_data_format)), + (LiftAsTFPartitionedCall<"composite_depthwise_conv2d_with_bias_and_"# ActivationName #"_fn"> + (ArgumentList $input, $filter, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"padding"> $padding), + (NamedAttr<"explicit_paddings"> $explicit_paddings), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 10)>; + + def LiftMatmulWith#ActivationOp : Pat< + (ActivationOp:$res + (TF_MatMulOp $a, $b, $transpose_a, $transpose_b, $grad_a, $grad_b)), + (LiftAsTFPartitionedCall<"composite_matmul_with_"# ActivationName #"_fn"> + (ArgumentList $a, $b), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"transpose_a"> $transpose_a), + (NamedAttr<"transpose_b"> $transpose_b))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 10)>; + + def LiftMatmulWithBiasAnd#LastFusedOp : Pat< + (ActivationOp:$res + (TF_BiasAddOp + (TF_MatMulOp $a, $b, $transpose_a, $transpose_b, $grad_a, $grad_b), + $bias, IsDataFormatNHWC:$bias_data_format)), + (LiftAsTFPartitionedCall<"composite_matmul_with_bias_and_"# ActivationName #"_fn"> + (ArgumentList $a, $b, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"transpose_a"> 
$transpose_a), + (NamedAttr<"transpose_b"> $transpose_b))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 10)>; + + def LiftConv3dWith#ActivationOp : Pat< + (ActivationOp:$res + (TF_Conv3DOp $input, $filter, $strides, $padding, + IsDataFormatNDHWC:$data_format, $dilations)), + (LiftAsTFPartitionedCall<"composite_conv3d_with_"# ActivationName #"_fn"> + (ArgumentList $input, $filter), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"padding"> $padding), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 10)>; + + def LiftConv3dWithBiasAnd#LastFusedOp : Pat< + (ActivationOp:$res + (TF_BiasAddOp + (TF_Conv3DOp $input, $filter, $strides, $padding, + IsDataFormatNDHWC:$data_format, $dilations), + $bias, $bias_data_format)), + (LiftAsTFPartitionedCall<"composite_conv3d_with_bias_and_"# ActivationName #"_fn"> + (ArgumentList $input, $filter, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"padding"> $padding), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 10)>; + + def LiftBatchMatMulWith#ActivationOp : Pat< + (ActivationOp:$res + (TF_BatchMatMulV2Op $x, $y, $adj_x, $adj_y, $grad_x, $grad_y)), + (LiftAsTFPartitionedCall<"composite_batch_matmul_with_"# ActivationName #"_fn"> + (ArgumentList $x, $y), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"adj_x"> $adj_x), + (NamedAttr<"adj_y"> $adj_y))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 10)>; + + def LiftBatchMatMulWithBiasAnd#LastFusedOp : Pat< + (ActivationOp:$res + (TF_BiasAddOp + (TF_BatchMatMulV2Op $x, $y, $adj_x, $adj_y, $grad_x, $grad_y), + $bias, IsDataFormatNHWC:$bias_data_format)), + (LiftAsTFPartitionedCall<"composite_batch_matmul_with_bias_and_"# ActivationName #"_fn"> + (ArgumentList $x, $y, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"adj_x"> $adj_x), + (NamedAttr<"adj_y"> $adj_y))), + [(IsNotInLiftedFunc $res)], [], (addBenefit 10)>; + + def LiftEinsumWith#ActivationOp : Pat< + (ActivationOp:$res + (TF_EinsumOp $input, $equation)), + (LiftAsTFPartitionedCall<"composite_einsum_with_"# ActivationName #"_fn"> + (ArgumentList $input), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"equation"> $equation))), + [(IsNotInLiftedFunc $res), + (IsEinsumSupportedByXlaDotV2 $equation)], + [], (addBenefit 10)>; + + def LiftEinsumWithBiasAnd#LastFusedOp : Pat< + (ActivationOp:$res + (TF_BiasAddOp + (TF_EinsumOp $input, $equation), + $bias, IsDataFormatNHWC:$bias_data_format)), + (LiftAsTFPartitionedCall<"composite_einsum_with_bias_and_"# ActivationName #"_fn"> + (AppendToVector (ArgumentList $input), $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"equation"> $equation))), + [(IsNotInLiftedFunc $res), + (IsEinsumSupportedByXlaDotV2 $equation)], + [], (addBenefit 10)>; + +} +defm : LiftCompositeOpsWithActivation; +defm : LiftCompositeOpsWithActivation; + +def LiftGather : Pat< + (TF_GatherV2Op:$res $params, $indices, $axis, $batch_dims), + (LiftAsTFPartitionedCall<"composite_gather_fn"> + (ArgumentList $params, $indices, $axis), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"batch_dims"> $batch_dims))), + [(IsNotInLiftedFunc $res), (IsConstTensor $params)], [], (addBenefit 1)>; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_quantizable_spots_as_functions_drq.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_quantizable_spots_as_functions_drq.cc new file mode 100644 
index 000000000000..33ebbecd8759 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_quantizable_spots_as_functions_drq.cc @@ -0,0 +1,213 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Rewrite/FrozenRewritePatternSet.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_lift_as_function_call.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using QuantMethod = + ::tensorflow::quantization::QuantizationMethod::PresetMethod; +using ::tensorflow::quantization::OpSet; + +class LiftQuantizableSpotsAsFunctionsDRQPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + LiftQuantizableSpotsAsFunctionsDRQPass) + + // Constructor used by the PassRegistration. This is only used by test. + explicit LiftQuantizableSpotsAsFunctionsDRQPass() = default; + + // Constructor used by manually creating the pass. + explicit LiftQuantizableSpotsAsFunctionsDRQPass( + const QuantMethod quantization_method, const OpSet target_opset, + const int min_num_elements_for_weights) { + quantization_method_ = quantization_method; + target_opset_ = target_opset; + min_num_elements_for_weights_ = min_num_elements_for_weights; + } + + LiftQuantizableSpotsAsFunctionsDRQPass( + const LiftQuantizableSpotsAsFunctionsDRQPass& other) { + quantization_method_ = other.quantization_method_; + target_opset_ = other.target_opset_; + min_num_elements_for_weights_ = other.min_num_elements_for_weights_; + } + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). 
+ return "tf-quant-lift-quantizable-spots-as-functions-drq"; + } + + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Replace quantization candidates with composite functions into the " + "module for post-training dynamic range case"; + } + + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + void runOnOperation() override; + + private: + Option target_opset_{ + *this, "target-opset", llvm::cl::init(OpSet::TF), + llvm::cl::desc("Choose target opset."), + llvm::cl::values( + clEnumValN(OpSet::TF, "TF", + "Uses TF ops that mimic quantization behavior"), + clEnumValN(OpSet::XLA, "XLA", "Uses TF XLA ops"), + clEnumValN(OpSet::UNIFORM_QUANTIZED, "UNIFORM_QUANTIZED", + "Uses TF Uniform Quantized ops"))}; + + Option min_num_elements_for_weights_{ + *this, "min-num-elements-for-weights", llvm::cl::init(0), + llvm::cl::desc("The minimum required number of elements in a weight " + "array to apply quantization.")}; + + Option quantization_method_{ + *this, "quantization-method", + llvm::cl::init(tensorflow::quantization::QuantizationMethod:: + METHOD_DYNAMIC_RANGE_INT8), + llvm::cl::desc("Choose quantization method."), + llvm::cl::values( + clEnumValN(tensorflow::quantization::QuantizationMethod:: + METHOD_DYNAMIC_RANGE_INT8, + "drq", "Post-training dynamic-range quantizaiton"), + clEnumValN(tensorflow::quantization::QuantizationMethod:: + METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8, + "weight_only", "Post-training weight_only quantizaiton"))}; +}; + +class CheckQuantizableOps + : public mlir::OpRewritePattern { + public: + explicit CheckQuantizableOps(MLIRContext* context, + const QuantMethod quantization_method, + const OpSet target_opset, + const int min_num_elements_for_weights) + : OpRewritePattern(context), + quantization_method_(quantization_method), + target_opset_(target_opset), + min_num_elements_for_weights_(min_num_elements_for_weights) {} + + private: + LogicalResult matchAndRewrite(TF::PartitionedCallOp call_op, + PatternRewriter& rewriter) const override { + std::unique_ptr spec = GetTFOpQuantSpec(call_op); + if (spec->quantizable_operands.empty()) return failure(); + + for (auto idx : spec->quantizable_operands) { + // This op is guaranteed to be a constant as ODS checks IsConstTensor. + // Check if the number of elements meets the requirement. 
+ int current_num_elements = + mlir::cast(call_op.getOperand(idx).getType()) + .getNumElements(); + if (current_num_elements < min_num_elements_for_weights_) { + call_op.emitRemark("Quantization is skipped for ") + << call_op->getName().getStringRef().str() << " because it has " + << current_num_elements + << " elements which is fewer than the threshold(" + << min_num_elements_for_weights_ << " elements)."; + call_op->removeAttr(kQuantTraitAttrName); + } + } + + StringRef function_name = + mlir::cast(call_op.getFAttr()).getValue(); + if ((quantization_method_ == tensorflow::quantization::QuantizationMethod:: + METHOD_DYNAMIC_RANGE_INT8) && + (function_name.contains("batch_matmul") || + function_name.contains("conv3d"))) { + call_op->removeAttr(kQuantTraitAttrName); + } + + // TODO(b/270906404): Support weight-only gather for uniform quantized opset + // in PTQ mode + if (target_opset_ == OpSet::UNIFORM_QUANTIZED && + function_name.contains("gather")) { + call_op->removeAttr(kQuantTraitAttrName); + } + + return failure(); + } + QuantMethod quantization_method_; + OpSet target_opset_; + int min_num_elements_for_weights_; +}; + +static PassRegistration pass; + +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions_drq.inc" + +void LiftQuantizableSpotsAsFunctionsDRQPass::runOnOperation() { + MLIRContext* ctx = &getContext(); + RewritePatternSet patterns(ctx); + ModuleOp module = getOperation(); + + populateWithGenerated(patterns); + patterns.add(ctx, quantization_method_, target_opset_, + min_num_elements_for_weights_); + FrozenRewritePatternSet frozen_patterns(std::move(patterns)); + for (auto func : module.getOps()) { + if (failed(applyPatternsGreedily(func, frozen_patterns))) { + func.emitError() + << "tf-quant-lift-quantizable-spots-as-functions-drq failed."; + signalPassFailure(); + } + } +} + +} // namespace + +std::unique_ptr> +CreateLiftQuantizableSpotsAsFunctionsDRQPass( + const QuantMethod quantization_method, const OpSet target_opset, + const int min_num_elements_for_weights) { + return std::make_unique( + quantization_method, target_opset, min_num_elements_for_weights); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_quantizable_spots_as_functions_drq.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_quantizable_spots_as_functions_drq.td new file mode 100644 index 000000000000..cd978b302f46 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_lift_quantizable_spots_as_functions_drq.td @@ -0,0 +1,93 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +include "mlir/IR/OpBase.td" +include "mlir/Dialect/Func/IR/FuncOps.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" +include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td" + +//===----------------------------------------------------------------------===// +// Pattern rules for lifting ops as functions +//===----------------------------------------------------------------------===// + +def LiftConv : Pat< + (TF_Conv2DOp:$res $input, $filter, $strides, $use_cudnn_on_gpu, $padding, + $explicit_paddings, IsDataFormatNHWC:$data_format, $dilations), + (LiftAsTFPartitionedCall<"composite_conv2d_fn"> + (ArgumentList $input, $filter), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"use_cudnn_on_gpu"> $use_cudnn_on_gpu), + (NamedAttr<"padding"> $padding), + (NamedAttr<"explicit_paddings"> $explicit_paddings), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res), (IsConstTensor $filter)], [], (addBenefit 1)>; + +def LiftDepthwiseConv : Pat< + (TF_DepthwiseConv2dNativeOp:$res $input, $filter, $strides, $padding, + $explicit_paddings, IsDataFormatNHWC:$data_format, $dilations), + (LiftAsTFPartitionedCall<"composite_depthwise_conv2d_fn"> + (ArgumentList $input, $filter), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"padding"> $padding), + (NamedAttr<"explicit_paddings"> $explicit_paddings), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res), (IsConstTensor $filter)], [], (addBenefit 1)>; + +def LiftMatMul : Pat< + (TF_MatMulOp:$res $a, $b, $transpose_a, $transpose_b, $grad_a, $grad_b), + (LiftAsTFPartitionedCall<"composite_matmul_fn"> + (ArgumentList $a, $b), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"transpose_a"> $transpose_a), + (NamedAttr<"transpose_b"> $transpose_b))), + [(IsNotInLiftedFunc $res), (IsConstTensor $b)], [], (addBenefit 1)>; + +def LiftGather : Pat< + (TF_GatherV2Op:$res $params, $indices, $axis, $batch_dims), + (LiftAsTFPartitionedCall<"composite_gather_fn"> + (ArgumentList $params, $indices, $axis), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"batch_dims"> $batch_dims))), + [(IsNotInLiftedFunc $res), (IsConstTensor $params)], [], (addBenefit 1)>; + +def LiftConv3D : Pat< + (TF_Conv3DOp:$res $input, $filter, $strides, $padding, + IsDataFormatNDHWC:$data_format, $dilations), + (LiftAsTFPartitionedCall<"composite_conv3d_fn"> + (ArgumentList $input, $filter), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"strides"> $strides), + (NamedAttr<"padding"> $padding), + (NamedAttr<"dilations"> $dilations))), + [(IsNotInLiftedFunc $res), (IsConstTensor $filter)], [], (addBenefit 1)>; + +def LiftBatchMatMul : Pat< + (TF_BatchMatMulV2Op:$res $x, $y, $adj_x, $adj_y, $grad_x, $grad_y), + (LiftAsTFPartitionedCall<"composite_batch_matmul_fn"> + (ArgumentList $x, $y), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"adj_x"> $adj_x), + (NamedAttr<"adj_y"> $adj_y))), + [(IsNotInLiftedFunc $res), (IsConstTensor $y)], [], (addBenefit 1)>; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_mark_functions_noinline.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_mark_functions_noinline.cc new file mode 100644 index 000000000000..deaf279c392e --- /dev/null +++ 
b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_mark_functions_noinline.cc @@ -0,0 +1,125 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" + +// Required when using LLVM_DEBUG macro. +#define DEBUG_TYPE "tf-mark-functions-noinline" + +namespace mlir { +namespace tf_quant { +namespace { + +// Name of the boolean attribute indicating whether the function can be +// inlined or not. +constexpr StringRef kTfNoinlineAttr = "tf._noinline"; + +// This pass marks functions with the attribute `tf._noinline = true` so that +// they aren't inlined by the `InlinerPass`. The names of the functions to be +// marked noinline should be specified by the `noinline-functions` option. +class MarkFunctionsNoinlinePass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MarkFunctionsNoinlinePass) + + explicit MarkFunctionsNoinlinePass() + : MarkFunctionsNoinlinePass( + /*noinline_functions=*/ArrayRef{}) {} + + // `noinline_functions` is a list of function names to be marked noinline. + explicit MarkFunctionsNoinlinePass( + const ArrayRef noinline_functions) + : noinline_functions_(CreateNoinlineFunctionsOption(noinline_functions)) { + } + + MarkFunctionsNoinlinePass(const MarkFunctionsNoinlinePass& other) + : MarkFunctionsNoinlinePass() { + noinline_functions_ = other.noinline_functions_; + } + + StringRef getArgument() const final { return "tf-mark-functions-noinline"; } + + StringRef getDescription() const final { + return "Marks a function whose name is in `noinline-functions` option with " + "the attribute `tf._noinline = true`. This prevents the function " + "from being inlined by the `InlinerPass`."; + } + + void runOnOperation() override; + + private: + ListOption CreateNoinlineFunctionsOption( + const ArrayRef noinline_functions) { + return {*this, "noinline-functions", + llvm::cl::desc( + "Name of the functions that should be marked " + "tf._noinline = true to prevent inlining. The name of the " + "function should exactly match to be marked noinline."), + llvm::cl::list_init(noinline_functions), + llvm::cl::ZeroOrMore}; + } + + // Gets a set of function names from `noinline_functions_`.
+ llvm::StringSet<> GetNoinlineFunctionsSet() { + llvm::StringSet<> noinline_functions; + noinline_functions.insert(noinline_functions_.begin(), + noinline_functions_.end()); + return noinline_functions; + } + + // Names of the functions to be marked noinline. + ListOption noinline_functions_; +}; + +void MarkFunctionsNoinlinePass::runOnOperation() { + const llvm::StringSet<> noinline_functions = GetNoinlineFunctionsSet(); + + func::FuncOp func_op = getOperation(); + Builder builder(&getContext()); + + // Adds the `tf._noinline = true` attribute to the function if the name + // matches. + if (noinline_functions.contains(func_op.getSymName())) { + func_op->setAttr(kTfNoinlineAttr, builder.getBoolAttr(true)); + LLVM_DEBUG(llvm::dbgs() + << "Marked tf._noinline = true: " << func_op.getSymName()); + } +} + +static PassRegistration pass{}; + +} // namespace + +std::unique_ptr> CreateMarkFunctionsNoinlinePass( + const ArrayRef noinline_functions) { + return std::make_unique(noinline_functions); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_merge_duplicate_resource_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_merge_duplicate_resource_ops.cc new file mode 100644 index 000000000000..ab99a9d21e83 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_merge_duplicate_resource_ops.cc @@ -0,0 +1,149 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using ::mlir::tf_executor::GraphOp; +using ::mlir::tf_executor::IslandOp; + +constexpr StringRef kSharedNameAttr = "shared_name"; + +class MergeDuplicateResourceOpsPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MergeDuplicateResourceOpsPass) + + StringRef getArgument() const final { + return "tf-quant-merge-duplicate-resource-ops"; + } + + StringRef getDescription() const final { + return "Merge resource ops that have the same shared name."; + } + + void runOnOperation() override; +}; + +// Checks if the island op contains a resource op like Variable or Hashtable +// and returns that resource op. Otherwise, returns null. +Operation* GetResourceOp(Operation* op) { + // Check if the island has only one block that contains exactly two ops: + // one resource op and one Yield op. + auto island_op = llvm::dyn_cast_or_null(op); + if (!island_op || !island_op.getBody().hasOneBlock()) return nullptr; + auto& island_block = island_op.getBody().front(); + if (++island_block.begin() != --island_block.end()) return nullptr; + + Operation* resource_op = &island_block.front(); + if (llvm::isa(resource_op)) { + return resource_op; + } + return nullptr; +} + +// Returns the `shared_name` attribute value if it exists. If not, returns an +// empty string. +StringRef GetSharedName(Operation* op) { + if (!op->hasAttrOfType(kSharedNameAttr)) return ""; + return op->getAttrOfType(kSharedNameAttr).getValue(); +} + +// Gets the GraphOp from the function op. Returns an empty op iff it doesn't +// exist. +// TODO(b/284222084): Move executor dialect utilities to a new library. +GraphOp GetGraphOpFromFuncOp(func::FuncOp func_op) { + if (func_op->getNumRegions() == 0 || func_op.getBody().empty()) return {}; + + auto graph_op_range = func_op.front().without_terminator(); + if (llvm::hasSingleElement(graph_op_range)) { + // The pass runs on a valid tf_executor dialect, so the op should be the + // GraphOp.
+ return cast(graph_op_range.begin()); + } + + return {}; +} + +void MergeDuplicateResourceOpsPass::runOnOperation() { + func::FuncOp func_op = getOperation(); + GraphOp graph_op = GetGraphOpFromFuncOp(func_op); + if (!graph_op) return; + + llvm::StringMap shared_name_to_resource; + llvm::SmallVector ops_to_remove; + for (Operation& op : graph_op.GetBody().without_terminator()) { + Operation* resource_op = GetResourceOp(&op); + if (!resource_op) continue; + StringRef shared_name = GetSharedName(resource_op); + if (shared_name.empty()) continue; + + if (!shared_name_to_resource.contains(shared_name)) { + shared_name_to_resource[shared_name] = resource_op; + continue; + } + + auto existing_resource = shared_name_to_resource[shared_name]; + if (resource_op->getName().getStringRef() != + existing_resource->getName().getStringRef() || + resource_op->getResult(0).getType() != + existing_resource->getResult(0).getType()) { + resource_op->emitOpError( + "This op has the same `shared_name` but different type with another " + "resource op in the function"); + signalPassFailure(); + return; + } + op.replaceAllUsesWith(existing_resource->getParentOp()->getResults()); + ops_to_remove.push_back(&op); + } + + // Remove op after the loop to avoid crash. + for (Operation* op : ops_to_remove) { + op->erase(); + } +} + +static PassRegistration pass{}; + +} // namespace + +std::unique_ptr> +CreateMergeDuplicateResourceOpsPass() { + return std::make_unique(); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_merge_initializer_function_ops_to_main.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_merge_initializer_function_ops_to_main.cc new file mode 100644 index 000000000000..84518e22c3b8 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_merge_initializer_function_ops_to_main.cc @@ -0,0 +1,402 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include +#include + +#include "absl/cleanup/cleanup.h" +#include "absl/container/flat_hash_map.h" +#include "absl/log/log.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeRange.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/func.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using ::mlir::tf_executor::FetchOp; +using ::mlir::tf_executor::GraphOp; +using ::mlir::tf_executor::IslandOp; +using ::mlir::tf_saved_model::GetInitializerFunctions; +using ::mlir::tf_saved_model::GetSessionInitializerOp; +using ::mlir::tf_saved_model::kTfSavedModelInitializerInitType; +using ::mlir::tf_saved_model::kTfSavedModelInitializerRestoreType; +using ::mlir::tf_saved_model::kTfSavedModelInitializerTypeAttr; + +// Array of initializer functions' types. The corresponding initializer +// functions should be merged in this order. This is because: +// 1) Variable restoration usually happens before initialization of other +// resources when a SavedModel is loaded. This ordering follows this semantic. +// 2) The `tf_saved_model` dialect requires that the arguments with +// `tf_saved_model.index_path` attributes should precede those with +// `tf_saved_model.bound_input` attributes. The init function of type +// `kTfSavedModelInitializerRestoreType` usually has an argument with +// `tf_saved_model.index_path`, whereas the init function of type +// `kTfSavedModelInitializerInitType` may have arguments with +// `tf_saved_model.bound_input`. This ordering avoids breaking the argument +// ordering constraint. +constexpr std::array kInitializerTypesByMergeOrder = { + kTfSavedModelInitializerRestoreType, kTfSavedModelInitializerInitType}; + +// This pass moves all ops from initializer functions to the main function. A +// new `tf.NoOp` that has control dependency to the initializer function for +// non-variable resources will be created. 
The control output of the new +// `tf.NoOp` will be merged into the main function's `FetchOp`. +class MergeInitializerFunctionOpsToMainPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + MergeInitializerFunctionOpsToMainPass) + + explicit MergeInitializerFunctionOpsToMainPass() = default; + + StringRef getArgument() const override { + return "tf-quant-merge-initializer-function-ops-to-main"; + } + + StringRef getDescription() const override { + return "Moves all ops from the initializer functions to the main function. " + "A new `tf.NoOp` that has a control dependency to the initializer " + "function for non-variable resources will be created. Its control " + "output will be merged into the main function's `FetchOp`. The " + "initializer functions will be removed after this pass."; + } + + void runOnOperation() override; + + private: + void getDependentDialects(DialectRegistry& registry) const override { + registry + .insert(); + } +}; + +// Returns true iff func_op has either no Region or the body has no Blocks. +bool IsFuncOpEmpty(func::FuncOp func_op) { + return func_op->getNumRegions() == 0 || func_op.getBody().empty(); +} + +// Gets the GraphOp from the function op. Returns an empty op iff it doesn't +// exist. +GraphOp GetGraphOpFromFuncOp(func::FuncOp func_op) { + if (IsFuncOpEmpty(func_op)) return {}; + + auto graph_op_range = func_op.front().without_terminator(); + if (llvm::hasSingleElement(graph_op_range)) { + // The pass runs on a valid tf_executor dialect, so the op should be the + // GraphOp. + return cast(graph_op_range.begin()); + } + + return {}; +} + +// Gets the string representation of the type name. +std::string GetTypeName(const Type type) { + std::string type_name{}; + auto os = llvm::raw_string_ostream{type_name}; + os << type; + return type_name; +} + +// Retrieves the value of `tf_saved_model.initializer_type` attribute from the +// initializer function. Assumes that there exists such an attribute. +std::string GetInitializerType(func::FuncOp init_func_op) { + return init_func_op + ->getAttrOfType(kTfSavedModelInitializerTypeAttr) + .str(); +} + +// An initializer function should satisfy the following conditions: +// * Its GraphOp should only have control outputs. +// * "tf_saved_model.initializer_type" attribute must exist. +LogicalResult ValidateInitFunc(func::FuncOp init_func_op) { + GraphOp graph_op = GetGraphOpFromFuncOp(init_func_op); + if (!graph_op) return success(); // Consider empty FuncOp valid. + + FetchOp fetch_op = graph_op.GetFetch(); + for (const Value fetch : fetch_op.getFetches()) { + if (!mlir::isa(fetch.getType())) { + fetch_op.emitError(absl::StrFormat( + "Validation failed for the initializer function: %s. " + "All initializer function's fetches should be " + "tf_executor::ControlType. Got: %s.", + init_func_op.getName().str(), GetTypeName(fetch.getType()))); + return failure(); + } + } + + if (const auto init_type_attr = init_func_op->getAttrOfType( + kTfSavedModelInitializerTypeAttr); + !init_type_attr) { + return init_func_op->emitError() << "Initializer func op does not have " + "tf_saved_model.initializer_type " + "attribute. Func op: " + << init_func_op.getSymName(); + } + + return success(); +} + +// Returns initializer_type -> init_func_op mapping from the session_init_op's +// initializers. The initializer functions are validated for whether they can be +// moved to the main function. Returns failure() iff validation fails.
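+// For a typical SavedModel the returned map might look like +// {"restore_op" -> @session_init_restore, "init_op" -> @init_all_tables}, +// keyed by each function's "tf_saved_model.initializer_type" attribute +// (the function names here are hypothetical).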
+FailureOr> GetInitFuncOps( + ModuleOp module_op) { + absl::flat_hash_map init_func_ops; + + for (func::FuncOp init_func_op : GetInitializerFunctions(module_op)) { + if (failed(ValidateInitFunc(init_func_op))) { + return failure(); + } + + init_func_ops[GetInitializerType(init_func_op)] = init_func_op; + } + + return init_func_ops; +} + +// Creates new arguments to the main function that corresponds to the source +// function's arguments. Returns the `IRMapping` that contains the +// relationship. +IRMapping CloneSrcFuncArgumentsToMainFunc(func::FuncOp src_func_op, + func::FuncOp main_func_op) { + IRMapping mapper{}; + + for (auto [src_arg_idx, src_arg] : + llvm::enumerate(src_func_op.getArguments())) { + // No need to create a mapping when there is no usage - it will not affect + // the cloning. + if (src_arg.use_empty()) continue; + + const unsigned main_arg_idx = main_func_op.getNumArguments(); + + const DictionaryAttr main_arg_attr = + src_func_op.getArgAttrDict(src_arg_idx); + + (void)main_func_op.insertArgument(main_arg_idx, src_arg.getType(), + main_arg_attr, src_arg.getLoc()); + + const std::string new_input_name = + absl::StrCat(GetInitializerType(src_func_op), "_", src_arg_idx, ":0"); + + quant::AddEntryFunctionInput(new_input_name, main_func_op); + + // During cloning, let it know that the source function's argument + // corresponds to the main function's newly created argument when cloning + // ops from src -> main. + BlockArgument main_arg = main_func_op.getArgument(main_arg_idx); + mapper.map(src_arg, main_arg); + } + + return mapper; +} + +// Copies ops from `src_func_op` to `main_body` except for the FetchOps. Returns +// the fetch values in the main GraphOp corresponding to the original fetch +// values from `src_func_op`. Returns an empty vector when `src_func_op` is +// empty. `main_func_op` must have a GraphOp. +SmallVector CopyOpsToMainFunction(func::FuncOp src_func_op, + func::FuncOp main_func_op) { + GraphOp src_graph_op = GetGraphOpFromFuncOp(src_func_op); + if (!src_graph_op) { + VLOG(1) << "Function " << src_func_op.getName().str() + << " does not have a tf_executor::GraphOp. No ops are copied to " + "the main function."; + return {}; + } + + GraphOp main_graph_op = GetGraphOpFromFuncOp(main_func_op); + + FetchOp main_fetch_op = main_graph_op.GetFetch(); + const absl::Cleanup erase_main_fetch_op = [main_fetch_op]() mutable { + main_fetch_op.erase(); + }; + + // TODO(b/245473863): Handle when assets are actually used in the body. + IRMapping mapper = CloneSrcFuncArgumentsToMainFunc(src_func_op, main_func_op); + + // Clones each op from src to main_body. + Block& main_body = main_graph_op.GetBody(); + Block& src_body = src_graph_op.GetBody(); + for (Operation& op : src_body.without_terminator()) { + main_body.push_back(op.clone(mapper)); + } + + // Relocate the main function's FetchOp at the last. + main_body.push_back(main_fetch_op->clone(mapper)); + + // Clone the source's FetchOp, but do not push to the main function's body. + // The clone is only needed to identify the fetch operands. + auto cloned_fetch_op = cast(src_graph_op.GetFetch()->clone(mapper)); + const absl::Cleanup erase_cloned_fetch_op = [cloned_fetch_op]() mutable { + cloned_fetch_op.erase(); + }; + + return llvm::to_vector(cloned_fetch_op.getFetches()); +} + +// Creates a new `IslandOp` that wraps a `TF::NoOp`. The `IslandOp` has control +// dependencies to the values provided. 
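+// The emitted IR is roughly (value names illustrative, syntax approximate): +// %ctl = tf_executor.island(%init_ctl_0, %init_ctl_1) { +// "tf.NoOp"() : () -> () +// tf_executor.yield +// }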
+IslandOp CreateNoOpWithControlDependencies( + const Location loc, GraphOp main_graph_op, + const ArrayRef control_dependencies) { + auto builder = OpBuilder::atBlockTerminator(&main_graph_op.GetBody()); + + auto wrapper_island_op = builder.create( + loc, /*outputs=*/TypeRange{}, + /*control=*/tf_executor::ControlType::get(builder.getContext()), + /*controlInputs=*/control_dependencies); + wrapper_island_op.getBody().emplaceBlock(); + + // Create a NoOp inside the IslandOp. + auto guard = OpBuilder::InsertionGuard(builder); + builder.setInsertionPointToStart(&wrapper_island_op.GetBody()); + + builder.create(loc); + builder.create(loc); + + return wrapper_island_op; +} + +// Adds a new fetch operand for the main function's GraphOp. +void AddFetchOperandToMain(GraphOp main_graph_op, const Value fetch_operand) { + FetchOp old_fetch = main_graph_op.GetFetch(); + const absl::Cleanup erase_old_fetch = [old_fetch]() mutable { + old_fetch.erase(); + }; + + auto fetches = llvm::to_vector(old_fetch.getFetches()); + fetches.emplace_back(fetch_operand); + + auto builder = OpBuilder::atBlockTerminator(&main_graph_op.GetBody()); + builder.create(main_graph_op.getLoc(), std::move(fetches)); +} + +// Creates a new Location for the initializer function. This creates a loc by +// attaching the initializer function's type to its name so that it is identifiable. +Location CreateInitOpLoc(MLIRContext* ctx, func::FuncOp init_func_ops) { + const std::string init_type = GetInitializerType(init_func_ops); + const std::string name = + absl::StrCat(init_type, "_", init_func_ops.getName().str()); + return NameLoc::get(StringAttr::get(ctx, name)); +} + +void MergeInitializerFunctionOpsToMainPass::runOnOperation() { + ModuleOp module_op = getOperation(); + MLIRContext* ctx = module_op.getContext(); + + func::FuncOp main_func_op = quant::FindMainFuncOp(module_op); + if (!main_func_op) { + module_op.emitError("Main function op not found."); + return signalPassFailure(); + } + + GraphOp main_graph_op = GetGraphOpFromFuncOp(main_func_op); + if (!main_graph_op) return; + + tf_saved_model::SessionInitializerOp session_init_op = + GetSessionInitializerOp(module_op); + if (!session_init_op) return; + + // initializer_type -> init_func_op mapping. + SymbolTable symbol_table{module_op}; + FailureOr> init_func_ops = + GetInitFuncOps(module_op); + if (failed(init_func_ops)) { + module_op->emitError("Validation on initializer functions failed."); + return signalPassFailure(); + } else if (init_func_ops->empty()) { + VLOG(1) << "No initializer functions found."; + return; + } + + // Find the initializer functions and clone their ops to @main. + for (const StringRef init_type : kInitializerTypesByMergeOrder) { + const auto it = init_func_ops->find(init_type); + if (it == init_func_ops->end()) continue; + + func::FuncOp init_func_op = it->second; + + const SmallVector init_op_fetches = + CopyOpsToMainFunction(init_func_op, main_func_op); + if (init_op_fetches.empty()) { + VLOG(1) << "No fetch values exist from initializer functions."; + return; + } + + // Creates a NoOp that has control dependency to the initializer function + // for non-variables.
+ const Location init_op_loc = CreateInitOpLoc(ctx, init_func_op); + IslandOp noop_wrapper_island_op = CreateNoOpWithControlDependencies( + init_op_loc, main_graph_op, + /*control_dependencies=*/init_op_fetches); + + AddFetchOperandToMain( + main_graph_op, + /*fetch_operand=*/noop_wrapper_island_op.getControl()); + + symbol_table.erase(init_func_op); + } + + // Empties the "initializers" attribute from the `SessionInitializerOp` since + // all ops of the initializer ops are cloned into @main. + session_init_op.setInitializersAttr(ArrayAttr::get(ctx, {})); +} + +} // namespace + +std::unique_ptr> +CreateMergeInitializerFunctionOpsToMainPass() { + return std::make_unique(); +} + +// Registers MergeInitializerFunctionOpsToMainPass. +static PassRegistration pass([] { + return CreateMergeInitializerFunctionOpsToMainPass(); +}); + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_merge_save_function_ops_to_main.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_merge_save_function_ops_to_main.cc new file mode 100644 index 000000000000..ac0347b0b8e4 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_merge_save_function_ops_to_main.cc @@ -0,0 +1,302 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include + +#include "absl/algorithm/container.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/ValueRange.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/constants.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using ::mlir::tf_executor::FetchOp; +using ::mlir::tf_executor::GraphOp; +using ::mlir::tf_executor::IslandOp; +using ::mlir::tf_saved_model::kTfSavedModelIndexPathAttr; +using ::tensorflow::kImportModelDefaultGraphFuncName; + +class MergeSaveFunctionOpsToMainPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MergeSaveFunctionOpsToMainPass) + + explicit MergeSaveFunctionOpsToMainPass() = default; + + StringRef getArgument() const override { + return "tf-quant-merge-save-function-ops-to-main"; + } + + StringRef getDescription() const override { + return "Merge the save function's ops to the main function. The save " + "function will be removed after the pass."; + } + + void runOnOperation() override; +}; + +// Returns true iff func_op has either no Region or the body has no Blocks. +bool IsFuncOpEmpty(func::FuncOp func_op) { + return func_op->getNumRegions() == 0 || func_op.getBody().empty(); +} + +// Gets the GraphOp from the function op. Returns an empty op iff it doesn't +// exist. +GraphOp GetGraphOpFromFuncOp(func::FuncOp func_op) { + if (IsFuncOpEmpty(func_op)) return {}; + + auto graph_op_range = func_op.front().without_terminator(); + if (llvm::hasSingleElement(graph_op_range)) { + // The pass runs on a valid tf_executor dialect, so the op should be the + // GraphOp. + return cast(graph_op_range.begin()); + } + + return {}; +} + +// Gets the "main" function from the module. Returns an empty op iff it doesn't +// exist. 
+func::FuncOp GetMainFunction(ModuleOp module_op) { + const auto main_func_id = + StringAttr::get(module_op.getContext(), kImportModelDefaultGraphFuncName); + auto func_ops = module_op.getOps(); + auto main_func_itr = absl::c_find_if(func_ops, [&main_func_id](auto func_op) { + return func_op.getName() == main_func_id; + }); + + if (main_func_itr == func_ops.end()) return {}; + return *main_func_itr; +} + +func::FuncOp GetSaveFuncOp(ModuleOp module_op) { + for (auto func_op : module_op.getOps()) { + if (func_op.getSymName() == quant::kTfQuantSaveFuncName) return func_op; + } + + return nullptr; +} + +// Adds the file prefix argument to `main_func_op`. The file prefix argument +// is the argument whose "tf_saved_model.index_path" attribute has +// "__tf_file_prefix". Its type is `tensor`. Also, the value +// "__tf_file_prefix:0" is appended to the "tf.entry_function" attribute's +// "inputs" key. +BlockArgument CreateFilePrefixArg(func::FuncOp main_func_op) { + Builder builder(main_func_op); + + // Add a new argument of type `tensor` and update the + // function type. + auto file_prefix_arg_type = + RankedTensorType::get(/*shape=*/{}, builder.getType()); + BlockArgument new_file_prefix_arg = + main_func_op.getBody().front().addArgument( + file_prefix_arg_type, + NameLoc::get(builder.getStringAttr(quant::kTfFilePrefix))); + + SmallVector input_types(main_func_op.getArgumentTypes()); + input_types.emplace_back(file_prefix_arg_type); + + main_func_op.setType( + builder.getFunctionType(input_types, main_func_op.getResultTypes())); + + // Add "__tf_file_prefix" to the "tf_saved_model.index_path" attribute for the + // newly created argument. + main_func_op.setArgAttr( + new_file_prefix_arg.getArgNumber(), + /*name=*/kTfSavedModelIndexPathAttr, + /*value=*/builder.getStrArrayAttr({quant::kTfFilePrefix})); + + // Append the "__tf_file_prefix:0" to the "tf.entry_function" attribute's + // item keyed by "inputs". + quant::AddEntryFunctionInput(Twine(quant::kTfFilePrefix).concat(":0").str(), + main_func_op); + + return new_file_prefix_arg; +} + +// Finds the file prefix argument from `main_func_op`. The file prefix argument +// is the argument whose "tf_saved_model.index_path" attribute has +// "__tf_file_prefix". If such an argument doesn't exist, returns a null value. +BlockArgument GetFilePrefixArg(func::FuncOp main_func_op) { + for (int i = 0; i < main_func_op.getNumArguments(); i++) { + auto index_path_attr = + main_func_op.getArgAttrOfType(i, kTfSavedModelIndexPathAttr); + if (index_path_attr && !index_path_attr.empty() && + mlir::cast(index_path_attr[0]) == quant::kTfFilePrefix) { + return main_func_op.getArgument(i); + } + } + return {}; +} + +// Returns the existing file prefix argument from the `main_func_op`. The file +// prefix argument is the argument whose "tf_saved_model.index_path" attribute +// has "__tf_file_prefix". If such an argument doesn't exist, creates a new file +// prefix argument and returns it. +BlockArgument GetOrCreateFilePrefixArg(func::FuncOp main_func_op) { + if (BlockArgument main_file_prefix_arg = GetFilePrefixArg(main_func_op); + main_file_prefix_arg) { + return main_file_prefix_arg; + } else { + return CreateFilePrefixArg(main_func_op); + } +} + +// Clones ops from `src_graph_op` to `dst_graph_op`. The `dst_graph_op`'s +// `FetchOp` will be used without modification. Returns the fetch operands from the +// `src_graph_op`.
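+// For instance, if the save function's graph fetches only a single control +// token, the value returned here is that token's clone inside `dst_graph_op`, +// which the caller then uses as a control input of the file-prefix IdentityOp.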
+Value CloneGraphOps(GraphOp src_graph_op, GraphOp dst_graph_op, + IRMapping& mapper) { + Block& main_body = dst_graph_op.GetBody(); + + // Take the reference of the main graph's FetchOp to later move to the end. + FetchOp main_fetch_op = dst_graph_op.GetFetch(); + + Block& save_func_body = src_graph_op.GetBody(); + for (Operation& op : save_func_body.without_terminator()) { + main_body.push_back(op.clone(mapper)); + } + + // Relocate the main function's FetchOp to the last. + main_body.push_back(main_fetch_op->clone(mapper)); + main_fetch_op.erase(); + + auto cloned_fetch_op = cast(src_graph_op.GetFetch()->clone(mapper)); + Value control_fetch = *cloned_fetch_op.getFetches().begin(); + cloned_fetch_op.erase(); + + return control_fetch; +} + +// Creates a new `IdentityOp` wrapped by an `IslandOp`. The identity op returns +// the `main_file_prefix_arg` and has control dependencies to `control_inputs`. +IslandOp CreateFilePrefixIdentityOp(const BlockArgument main_file_prefix_arg, + const ArrayRef control_inputs, + GraphOp main_graph_op) { + MLIRContext& ctx = *main_graph_op.getContext(); + const auto name_loc = + NameLoc::get(StringAttr::get(&ctx, quant::kTfQuantSaveOpName)); + + auto builder = OpBuilder::atBlockTerminator(&main_graph_op.GetBody()); + // Create an IslandOp that will wrap the IdentityOp. Add a control dependency + // for the newly copied save function. + auto wrapper_island_op = builder.create( + name_loc, TypeRange{main_file_prefix_arg.getType()}, + tf_executor::ControlType::get(&ctx), ValueRange(control_inputs)); + wrapper_island_op.getBody().emplaceBlock(); + + builder.setInsertionPointToStart(&wrapper_island_op.GetBody()); + auto identity_op = builder.create( + name_loc, /*result_types=*/main_file_prefix_arg.getType(), + /*input=*/main_file_prefix_arg); + + builder.create(name_loc, identity_op.getResult()); + + return wrapper_island_op; +} + +// Appends `value` to the arguments of the `FetchOp` of `graph_op`. +void AppendValueToFetch(GraphOp graph_op, Value value) { + FetchOp old_main_fetch = graph_op.GetFetch(); + auto fetches = llvm::to_vector(old_main_fetch.getFetches()); + fetches.emplace_back(value); + + auto builder = OpBuilder::atBlockTerminator(&graph_op.GetBody()); + builder.create(old_main_fetch.getLoc(), std::move(fetches)); + old_main_fetch.erase(); +} + +void MergeSaveFunctionOpsToMain(func::FuncOp save_func_op, + func::FuncOp main_func_op) { + GraphOp main_graph_op = GetGraphOpFromFuncOp(main_func_op); + if (!main_graph_op) return; + + GraphOp save_func_graph_op = GetGraphOpFromFuncOp(save_func_op); + if (!save_func_graph_op) return; + + IRMapping mapper{}; + BlockArgument main_file_prefix_arg = GetOrCreateFilePrefixArg(main_func_op); + // TODO(b/268452435): This part assumes that the save function is always valid + // and has the argument. Add a validation function to filter out any invalid + // inputs. + mapper.map(save_func_op.getArgument(0), main_file_prefix_arg); + + Value save_control_fetch = + CloneGraphOps(save_func_graph_op, main_graph_op, mapper); + + IslandOp file_prefix_identity_wrapper = CreateFilePrefixIdentityOp( + main_file_prefix_arg, /*control_inputs=*/{save_control_fetch}, + main_graph_op); + + // Adds the newly created identity op's control output to the main's fetches. 
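+ // After this call the main graph's terminator looks roughly like + // tf_executor.fetch %original_fetches..., %save_identity_ctl + // so that exporters can later locate the node that triggers saving + // (value names illustrative).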
+ AppendValueToFetch(main_graph_op, file_prefix_identity_wrapper.getControl()); +} + +} // namespace + +void MergeSaveFunctionOpsToMainPass::runOnOperation() { + ModuleOp module_op = getOperation(); + + func::FuncOp main_func_op = GetMainFunction(module_op); + if (!main_func_op) { + module_op.emitError("Main function op not found."); + return signalPassFailure(); + } + + func::FuncOp save_func_op = GetSaveFuncOp(module_op); + if (!save_func_op) return; + + MergeSaveFunctionOpsToMain(save_func_op, main_func_op); + + // Erase the save function when all ops are successfully cloned. + save_func_op.erase(); +} + +std::unique_ptr> +CreateMergeSaveFunctionOpsToMainPass() { + return std::make_unique(); +} + +static PassRegistration pass([] { + return CreateMergeSaveFunctionOpsToMainPass(); +}); + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_optimize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_optimize.cc new file mode 100644 index 000000000000..dea51450fc15 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_optimize.cc @@ -0,0 +1,70 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" // IWYU pragma: keep - required to use `IsSplatValueEqual`. +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" + +namespace mlir::tf_quant { +namespace { + +// Applies optimization after quantization. +class OptimizePass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OptimizePass) + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-optimize"; + } + StringRef getDescription() const final { + // This is a brief description of the pass. 
+ return "Applies optimization after quantization"; + } + + void runOnOperation() override; +}; + +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_optimize.inc" + +void OptimizePass::runOnOperation() { + RewritePatternSet patterns(&getContext()); + populateWithGenerated(patterns); + auto func = getOperation(); + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr> CreateOptimizePass() { + return std::make_unique(); +} + +static PassRegistration pass; + +} // namespace mlir::tf_quant diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_optimize.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_optimize.td new file mode 100644 index 000000000000..c40902d283e8 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_optimize.td @@ -0,0 +1,62 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +include "mlir/IR/OpBase.td" +include "mlir/IR/PatternBase.td" +include "mlir/Dialect/Func/IR/FuncOps.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" + +// Remove redundant `CastOp` to int8 if the input is properly clipped. +def RemoveRedundantCastOps : Pat< + (TF_CastOp:$root_cast + (TF_CastOp:$i8_cast + (TF_ClipByValueOp:$clip $input, $min_value, $max_value), + ConstBoolAttrFalse:$truncate2), + ConstBoolAttrFalse:$truncate1), + (TF_CastOp $clip, ConstBoolAttrFalse), + [(TensorOf<[I8]> $i8_cast), + (TensorOf<[I32]> $clip), + (IsIntSplatValueEqual<"int32_t", "-128"> $min_value), + (IsIntSplatValueEqual<"int32_t", "127"> $max_value)]>; + +// This pattern optimizes: +// (x + cst1) + cst2 -> x + cst +// (x - cst1) - cst2 -> x - cst +// Where: cst = cst1 + cst2 +foreach BinaryOp = [TF_AddV2Op, TF_SubOp] in { + def OptimizeConsecutive#BinaryOp : Pat< + (BinaryOp + (BinaryOp $x, (TF_ConstOp:$cst1 $cst1_value)), + (TF_ConstOp:$cst2 $cst2_value)), + (BinaryOp + $x, (TF_AddV2Op $cst1, $cst2))>; +} + +// This pattern optimizes: +// (x + cst1) - cst2 -> x - cst +// (x - cst1) + cst2 -> x + cst +// Where: cst = cst2 - cst1 +foreach BinaryOpPair = [[TF_AddV2Op, TF_SubOp], + [TF_SubOp, TF_AddV2Op]] in { + def OptimizeConsecutive#BinaryOpPair[0]#BinaryOpPair[1] : Pat< + (BinaryOpPair[0] + (BinaryOpPair[1] $x, (TF_ConstOp:$cst1 $cst1_value)), + (TF_ConstOp:$cst2 $cst2_value)), + (BinaryOpPair[0] + $x, (TF_SubOp $cst2, $cst1))>; +} + diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h new file mode 100644 index 000000000000..acc049f9c0b2 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h @@ -0,0 +1,251 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_TF_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_TF_PASSES_H_ + +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir { +namespace tf_quant { + +// Create a pass that inserts dump tensor to quantizable layer's output. +std::unique_ptr> CreateAddDumpTensorOpPass( + ::stablehlo::quantization::DebuggerConfig::DebuggerType debugger_type, + std::string log_dir_path); + +// Creates a pass that add QuantizationUnitLoc to quantizable layers. +std::unique_ptr> CreateAddQuantizationUnitLocPass(); + +// Replaces tf.CustomAggregator ops with quant.Stats ops for finalizing the +// calibration procedure. +std::unique_ptr> +CreateConvertCustomAggregationOpToQuantStatsPass(); + +// Creates a pass that casts BFloat16 operations to Float32 operations. This +// pass is a part of the ConvertTpuModelToCpu pass to support BF16 optimized TPU +// model quantization. +std::unique_ptr> CreateCastBf16OpsToF32Pass(); + +// Creates a pass that converts Tensorflow Xla ops to non-Xla ops. +std::unique_ptr> CreateConvertTfXlaOpToTfOpPass(); + +// Creates a pass that converts TPU models for CPU by removing TPU related ops +// such as TPUPartitionedCall, TPUReplicatedOp, etc. The TF quantizer does not +// work with models specifically designed for TPU, so this pass makes the input +// TPU model compatible with the TF quantizer by rewriting the TPU ops. The +// output model of this pass is expected to be ready for the TF quantizer. +std::unique_ptr> CreateConvertTpuModelToCpuPass(); + +// Creates a pass that duplicates constants that affect the shape of a tensor +// after some computation. +std::unique_ptr> +CreateDuplicateShapeDeterminingConstantsPass(); + +// Inserts custom aggregation operators for the calibration procedure. +std::unique_ptr> +CreateInsertCustomAggregationOpsPass( + const ::stablehlo::quantization::CalibrationOptions& calib_opts); + +// Creates a main function if it doesn't exist in the module. This is a +// workaround to make ConvertMlirToGraphdef work for multi-signatures graphs. +// TODO(b/204265523): Removes this pass after the exporting MLIR to SavedModel +// path is available. +std::unique_ptr> CreateInsertMainFunctionPass(); + +// Inserts quantized function library. 
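+// The inserted library provides composite implementations (for example a +// hypothetical `quantized_matmul_fn`) that the later quantize passes call +// into; which functions are inserted depends on `quantization_method` and +// `target_opset`.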
+std::unique_ptr> CreateInsertQuantizedFunctionsPass( + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + tensorflow::quantization::OpSet target_opset); + +// Creates a pass that creates a RestoreV2 op in the initializer function with +// type "restore_op" that initializes variables from the checkpoint. It finds +// tf.AssignVariableOp(tf.VarHandleOp, tf.Const) patterns in the initializer +// function and replaces tf.Consts with the results of RestoreV2. +std::unique_ptr> CreateInsertRestoreOpPass(); + +// Creates a pass that creates a new function that wraps the newly created +// SaveV2 op. The new function's name is "tf_quant__save". The function accepts +// a single string tensor as argument, which specifies the path to the +// checkpoint to which the variable's tensor values are saved. It finds +// `tf.AssignVariableOp(tf.VarHandleOp, tf.Const)` pattern in the initializer +// function of type "restore_op" to identify the VarHandleOps that should be +// saved using the SaveV2 op. +std::unique_ptr> CreateInsertSaveOpPass(); + +// Creates a pass that lifts HashTable ops as function arguments. In the graph +// execution mode, resource ops with the same `shared_name` attribute point to +// the same underlying resource. This is not true in the eager execution mode. +// Lifting resource ops as arguments will help unifying them across functions. +std::unique_ptr> CreateLiftHashTableOpsAsArgsPass(); + +// Lifts the quantizable spots as composite functions. +std::unique_ptr> +CreateLiftQuantizableSpotsAsFunctionsPass( + const tensorflow::quantization::QuantizationOptions& quant_options); + +// Lifts the dynamic range quantizable spots as composite functions. +std::unique_ptr> +CreateLiftQuantizableSpotsAsFunctionsDRQPass( + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + tensorflow::quantization::OpSet target_opset, + int min_num_elements_for_weights); + +// Creates a pass that marks functions with the attribute `tf._noinline = true` +// to avoid being inlined by the `InlinerPass`. `noinline_functions` is the name +// of the functions to mark. +std::unique_ptr> CreateMarkFunctionsNoinlinePass( + ArrayRef noinline_functions); + +// Creates a pass that merges duplicate resource ops in each function. Two +// resource ops are considered duplicated if they have the same `shared_name`. +std::unique_ptr> +CreateMergeDuplicateResourceOpsPass(); + +// Creates a pass that moves & merges initializer function's ops into the @main +// function. This pass should be run on a valid tf_executor dialect. The control +// output of the initializer function for non-variable resource initialization +// will be passed on as a dependency to a new `tf.NoOp`, whose control output +// will be merged into the main function's FetchOp. The initializer functions +// will be removed. +// +// Running this pass essentially has the effect of inlining the initializer +// functions into the main graph. This is beneficial when we wish to find and +// fetch the node that restores resources, after the ModuleOp has been exported +// as GraphDef. +std::unique_ptr> +CreateMergeInitializerFunctionOpsToMainPass(); + +// Creates a pass that moves & merges the "@tf_quant__save" function to "@main" +// function. A new `IdentityOp` will be created. It will have control dependency +// to the save function and returns the file_prefix argument (typed +// `tensor`). 
The file_prefix argument, which can be identified +// if the "tf_saved_model.index_path" attribute has "__tf_file_prefix", will be +// reused if it already exist in @main. Otherwise a new file prefix argument +// will be created. @tf_quant__save function will be erased. +// +// Running this pass essentially has the effect of inlining the @tf_quant__save +// into the main graph. This is beneficial when we wish to find and fetch +// the node that saves the variables, after the ModuleOp has been exported as +// GraphDef. +std::unique_ptr> CreateMergeSaveFunctionOpsToMainPass(); + +// Applies optimization patterns after quantization. +std::unique_ptr> CreateOptimizePass(); + +// Creates an instance of the PrepareQuantize pass, which will perform similar +// transformations as TFL::PrepareQuantizePass. +std::unique_ptr> CreatePrepareQuantizePass( + const tf_quant::QuantizationSpecs& quant_specs, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method); + +// Creates an instance of the PrepareQuantizeDRQ pass, which will +// perform similar transformations as TFL::PrepareQuantizeDynamicRangePass. +std::unique_ptr> CreatePrepareQuantizeDRQPass( + const tf_quant::QuantizationSpecs& quant_specs, + tensorflow::quantization::OpSet op_set); + +// Converts FakeQuant ops to quant.qcast and quant.dcast (QDQ) pairs. +std::unique_ptr> CreateConvertFakeQuantToQdqPass(); + +// Apply graph optimizations such as fusing and constant folding to prepare +// lifting. +std::unique_ptr> CreatePrepareLiftingPass( + tensorflow::quantization::OpSet target_opset); + +// Creates an instance of the PostQuantize pass, which will remove unnecessary +// ops from the final quantized graph. +std::unique_ptr> CreatePostQuantizePass(); + +// Propagate quantized type through allowed ops. +std::unique_ptr> CreatePropagateQuantizeTypePass(); + +// Replaces composite functions with quantized composite functions. After this +// pass runs, functions in the given graph will be replaced with their quantized +// versions. By doing so, the quantization will be applied to the given input. +// mlir_dump_file_prefix is an optional field that is used for debugging to save +// mlir dump files. +std::unique_ptr> CreateQuantizeCompositeFunctionsPass( + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + tensorflow::quantization::OpSet target_opset, + bool enable_per_channel_quantization, int min_num_elements_for_weights, + bool enable_legacy_weight_only = false, + std::optional mlir_dump_file_prefix = + std::nullopt); + +// Converts dequantize-(quantizable) call-quantize pattern to a single call op +// that has quantized input and output types. It is expected for this pass to +// emit illegal IR with unsupported quantized input and output types. The +// pass following immediately after this one will be responsible for legalizing +// input and output types by unwrapping quantization parameters. +std::unique_ptr> CreateQuantizePass(); + +// Overloading of CreateQuantizePass which takes QuantizationSpecs. +std::unique_ptr> CreateQuantizePass( + tf_quant::QuantizationSpecs quant_specs, + tensorflow::quantization::OpSet target_opset); + +// Apply quantization to weights based on the provided schemes. +std::unique_ptr> CreateQuantizeWeightsPass( + const tensorflow::quantization::QuantizationOptions& quant_options); + +// Removes `tf.AssignVariableOp(tf.VarHandleOp, tf.Const)` patterns from the +// initializer function (type = "restore_op"). 
+// Note: initializing values (`tf.Const`s) will be removed and this may result +// in an information loss and uninitialized variables eventually. Make sure that +// this effect is desired (e.g. there is a `tf.RestoreV2Op` that restores the +// variables instead). +std::unique_ptr> +CreateRemoveVariableInitializationByConstPass(); + +// Creates an instance of the ReplaceCastHacksWithTFXLAOpsPass, which will +// replace mixed-type convolution and matmul cast hacks by XLA Conv2DOp and +// MatmulOp. +std::unique_ptr> +CreateReplaceCastHacksWithTFXLAOpsPass(); + +// Creates a pass that "unfreezes" ConstOps into variables. Each ConstOp's use +// will be replaced by a VarHandleOp -> ReadVariableOp pattern. The newly +// created variables will be initialized in the session initializer function via +// AssignVariableOps. +std::unique_ptr> CreateUnfreezeConstantsPass(); + +// Creates an instance of the PreprocessOp pass, which will perform op +// preprocessing to allow multi-axis quantization, prior to quantization. +std::unique_ptr> CreatePreprocessOpPass( + tensorflow::quantization::OpSet op_set, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +} // namespace tf_quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_TF_PASSES_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_post_quantize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_post_quantize.cc new file mode 100644 index 000000000000..0b4777ae71bc --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_post_quantize.cc @@ -0,0 +1,161 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This transformation pass applies some clean up steps after quantization. 
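+//
+// A minimal usage sketch (illustrative only, not part of this change): the
+// pass is registered under "tf-quant-post-quantize" and runs on functions, so
+// a pipeline would typically add it as a nested pass, e.g.
+//   pm.addNestedPass<mlir::func::FuncOp>(
+//       mlir::tf_quant::CreatePostQuantizePass());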
+ +#include +#include +#include + +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_traits.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" // IWYU pragma: keep + +//===----------------------------------------------------------------------===// +// The post-quantize Passes. +// +namespace mlir { +namespace tf_quant { +namespace { + +// Applies all the clean up steps after quantization. +class PostQuantizePass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PostQuantizePass) + + // Constructor used by the PassRegistration. This will remove the adaptor ops. + explicit PostQuantizePass() = default; + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-post-quantize"; + } + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Apply post quantization clean up after quantization"; + } + + void runOnOperation() override; +}; + +enum RemoveVolatileOpsType { + // Remove all volatile quant-dequant ops. + kPreserveNone, + // Preserve volatile quant-dequants for input and output ops. + kPreserveInputsAndOutputs, +}; + +// Remove the back-to-back quantize and dequantize ops with volatile attribute. +template +struct RemoveVolatileOps + : public OpRewritePattern { + explicit RemoveVolatileOps(MLIRContext* context) + : OpRewritePattern(context, 1) {} + + LogicalResult matchAndRewrite(mlir::quant::ir::DequantizeCastOp op, + PatternRewriter& rewriter) const override { + auto input_op = op.getArg().getDefiningOp(); + if (auto q = + llvm::dyn_cast_or_null(input_op)) { + if (!q->getAttr(kVolatileOpAttrName)) return failure(); + + if (remove_volatile_ops_type == kPreserveInputsAndOutputs) { + // Don't remove leading and trailing QDQ for PTQ workflow, so the io + // modifying lib can work correctly. + if (!q.getArg().getDefiningOp()) return failure(); + if (op->hasOneUse() && + op->user_begin()->hasTrait()) + return failure(); + } + // If the quantize op is a requantize op, it is being used in other scale + // adjustments and should be kept. Instead, moving dequantize op before + // the requantize op to remove the unnecessary requantize op. 
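+      // Schematic example (types are illustrative): given
+      //   %q  = QuantizeCastOp(%arg)  : qtype1 -> qtype2   (a requantize)
+      //   %dq = DequantizeCastOp(%q)  : qtype2 -> f32
+      // the dequantize is rebuilt directly on %arg as
+      //   %dq = DequantizeCastOp(%arg) : qtype1 -> f32
+      // so the requantize remains available to its other users.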
+ if (auto qtype = + QuantizedType::getQuantizedElementType(q.getArg().getType())) { + rewriter.setInsertionPoint(op); + rewriter.replaceOpWithNewOp( + op, op.getResult().getType(), q.getArg()); + return success(); + } + + op.replaceAllUsesWith(q.getArg()); + return success(); + } + return failure(); + } +}; + +// The StorageCastOp is used to cast from a quantized type to its storage type +// or the opposite. If none of its input and output is quantized, the op has +// no effect and should be removed. +class RemoveRedundantScast + : public mlir::OpRewritePattern { + public: + explicit RemoveRedundantScast(MLIRContext* context) + : OpRewritePattern(context) {} + + private: + LogicalResult matchAndRewrite(mlir::quant::ir::StorageCastOp scast_op, + PatternRewriter& rewriter) const override { + if (QuantizedType::getQuantizedElementType(scast_op.getArg().getType()) || + QuantizedType::getQuantizedElementType(scast_op.getType())) { + return failure(); + } + + scast_op.replaceAllUsesWith(scast_op.getArg()); + return success(); + } +}; + +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_post_quantize.inc" + +void PostQuantizePass::runOnOperation() { + RewritePatternSet patterns(&getContext()); + auto func = getOperation(); + auto* ctx = func.getContext(); + patterns.add, + RemoveVolatileOps, RemoveRedundantScast>(ctx); + populateWithGenerated(patterns); + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + signalPassFailure(); + } +} + +} // namespace + +// Creates an instance of the TensorFlow dialect PostQuantize pass. +std::unique_ptr> CreatePostQuantizePass() { + return std::make_unique(); +} + +static PassRegistration pass; + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_post_quantize.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_post_quantize.td new file mode 100644 index 000000000000..e5cea091c8f1 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_post_quantize.td @@ -0,0 +1,35 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +include "mlir/IR/OpBase.td" +include "mlir/IR/PatternBase.td" +include "mlir/Dialect/Func/IR/FuncOps.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" + +// Re-orders the Identity op following a quantized composite function. This +// allows the QuantizeCompositeFunctionsPass to merge the DequantizeCast with +// the quantized composite function to optimize the requantization part. 
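+//
+// Schematically (illustrative, types elided):
+//   dcast(scast(Identity(scast(%v))))  ==>  Identity(dcast(%v))
+// where dcast/scast denote the DequantizeCast/StorageCast ops matched below.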
+def ReorderIdentityFollowingQuantizedFunction : Pat< + (Quantization_DequantizeCastOp:$output + (Quantization_StorageCastOp + (TF_IdentityOp + (Quantization_StorageCastOp $value)))), + (TF_IdentityOp + (Quantization_DequantizeCastOp + $value, (returnType (GetValueType $output))))>; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_lifting.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_lifting.cc new file mode 100644 index 000000000000..75c5c27bc40d --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_lifting.cc @@ -0,0 +1,359 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/remove_identity_op_pattern.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/einsum.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using ::tensorflow::quantization::OpSet; +using tf_quant::CloneOpWithReplacedOperands; +using tf_quant::HasStaticShape; + +class PrepareLiftingPass 
+ : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PrepareLiftingPass) + + PrepareLiftingPass() = default; + + explicit PrepareLiftingPass(OpSet op_set) { op_set_ = op_set; } + + PrepareLiftingPass(const PrepareLiftingPass& other) { + op_set_ = other.op_set_; + } + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-prepare-lifting"; + } + + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Apply graph optimizations such as fusing and constant folding to " + "prepare lifting."; + } + + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + void runOnOperation() override; + + private: + Option op_set_{ + *this, "target-opset", llvm::cl::init(OpSet::TF), + llvm::cl::desc("Choose target opset."), + llvm::cl::values( + clEnumValN(OpSet::TF, "TF", + "Uses TF ops that mimic quantization behavior"), + clEnumValN(OpSet::XLA, "XLA", "Uses TF XLA ops"), + clEnumValN(OpSet::UNIFORM_QUANTIZED, "UNIFORM_QUANTIZED", + "Uses TF Uniform Quantized ops"))}; +}; + +// Check if given indices in `val1` has same number of elements as given +// indices in `val2`. +bool HasEqualElementSize(Value val1, Value val2, ArrayRef val1_indices, + ArrayRef val2_indices) { + ShapedType val1_shape = mlir::cast(val1.getType()); + ShapedType val2_shape = mlir::cast(val2.getType()); + if (!val1_shape.hasRank() || !val2_shape.hasRank()) return false; + + int val1_result = 1; + int val2_result = 1; + for (auto idx : val1_indices) { + if (idx < 0) idx = idx + val1_shape.getRank(); + if (idx >= val1_shape.getRank() || val1_shape.isDynamicDim(idx)) { + return false; + } + val1_result *= val1_shape.getDimSize(idx); + } + + for (auto idx : val2_indices) { + if (idx < 0) idx = idx + val2_shape.getRank(); + if (idx >= val2_shape.getRank() || val2_shape.isDynamicDim(idx)) { + return false; + } + val2_result *= val2_shape.getDimSize(idx); + } + + return val1_result == val2_result; +} + +// Checks if a shape has dim sizes of all ones except the right most dim. +bool ReshapableTo1DTensor(ShapedType rhs_shape) { + for (auto rank = 0; rank < rhs_shape.getRank() - 1; rank++) { + if (rhs_shape.getDimSize(rank) != 1) { + return false; + } + } + return true; +} + +Value ReshapeTo1DTensor(OpBuilder& builder, Location loc, Value value) { + auto shape = mlir::cast(value.getType()); + if (shape.getRank() != 1) { + SmallVector new_shape; + new_shape.push_back(shape.getNumElements()); + value = builder.create( + loc, value, tf_quant::Create1DConstValue(builder, loc, new_shape)); + } + return ConstantFoldOpIfPossible(value.getDefiningOp()).front(); +} + +// Matches convolution op with "NHWC" data format or matmul op with false adj_y. 
+// The list of supported ops in this function is: +// - Conv2DOp +// - Conv3DOp +// - DepthwiseConv2dNativeOp +// - MatMulOp +// - BatchMatMulV2Op +LogicalResult MatchSupportedAffineOp(Operation* op, Value& binding_output, + Value& binding_input, + Value& binding_weight) { + bool is_supported_affine_op = false; + if (llvm::isa(op)) { + if (const auto data_format = op->getAttrOfType("data_format")) { + is_supported_affine_op = + data_format.getValue() == "NHWC" || data_format.getValue() == "NDHWC"; + } + } else if (llvm::isa(op)) { + if (const auto adj_y = op->getAttrOfType("adj_y")) { + is_supported_affine_op = !adj_y.getValue(); + } + } else if (llvm::isa(op)) { + if (const auto adj_y = op->getAttrOfType("transpose_b")) { + is_supported_affine_op = !adj_y.getValue(); + } + } + + if (!is_supported_affine_op) return failure(); + + // Bind input, output and weight to the given values. + binding_output = op->getResult(0); + binding_input = op->getOperand(0); + binding_weight = op->getOperand(1); + return success(); +} + +// Makes the 1D value broadcastable with the `rhs_shape`. +Value MakeOneDimValueBroadcastable(OpBuilder& builder, Location loc, + Value value, ShapedType rhs_shape) { + ShapedType value_shape = mlir::dyn_cast_or_null(value.getType()); + if (!value_shape || value_shape.getRank() != 1 || + !value_shape.hasStaticShape() || !rhs_shape.hasStaticShape()) { + return {}; + } + + int64_t num_elements = value_shape.getNumElements(); + SmallVector new_shape; + for (auto idx : llvm::reverse(llvm::seq(0, rhs_shape.getRank()))) { + const int64_t rhs_dim = rhs_shape.getDimSize(idx); + if (num_elements % rhs_dim != 0) { + return {}; + } + new_shape.push_back(rhs_dim); + num_elements = num_elements / rhs_dim; + if (num_elements == 1) break; + } + absl::c_reverse(new_shape); + + auto reshape_op = builder.create( + loc, value, tf_quant::Create1DConstValue(builder, loc, new_shape)); + return ConstantFoldOpIfPossible(reshape_op).front(); +} + +// Checks if a value can be symmetrically quantized. +bool CanBeSymmetricallyQuantized(Value weight) { + auto dq_op = weight.getDefiningOp(); + if (!dq_op) return true; + + auto qtype = + mlir::cast(dq_op.getArg().getType()).getElementType(); + if (auto uniform_type = llvm::dyn_cast_or_null(qtype)) { + return uniform_type.getZeroPoint() == 0; + } else if (auto per_axis_type = + llvm::dyn_cast_or_null( + qtype)) { + return absl::c_all_of(per_axis_type.getZeroPoints(), + [](int64_t x) { return x == 0; }); + } + return false; +} + +// Multiplies two 1D arrays with broadcasting support. +template +SmallVector MultiplyTwoArrays(ArrayRef a, ArrayRef b) { + auto get_value_at = [](ArrayRef v, size_t i) -> T { + if (v.size() == 1) return v.front(); + return v[i]; + }; + + size_t max_size = std::max(a.size(), b.size()); + SmallVector result(max_size); + for (size_t i : llvm::seq(0, max_size)) { + result[i] = get_value_at(a, i) * get_value_at(b, i); + } + return result; +} + +// Multiplies the value followed by a FakeQuant op and adjusts the quantization +// params. This function only supports symmetrically quantized values. 
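+// For example (illustrative numbers): a weight quantized with scale 0.5 and
+// zero point 0 that is multiplied by a constant 2.0 is rebuilt as
+// multiply-then-quantize with scale 0.5 * 2.0 = 1.0; the zero point stays 0,
+// which is why only symmetrically quantized values are supported here.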
+Value MultiplyFakeQuantValue(OpBuilder& builder, Location loc, Value value, + Value multiplier) { + auto dq_op = value.getDefiningOp(); + if (!dq_op) { + auto mul_op = builder.create(loc, value, multiplier); + return mul_op.getResult(); + } + auto q_op = dq_op.getArg().getDefiningOp(); + if (!q_op) return {}; + + Value float_value = q_op.getArg(); + Value new_value = builder.create(loc, float_value, multiplier); + auto new_value_type = mlir::cast(new_value.getType()); + + // Get multiplier value in double. + DenseFPElementsAttr multiplier_attr; + if (!matchPattern(multiplier, m_Constant(&multiplier_attr)) || + mlir::cast(multiplier_attr.getType()).getRank() > 1) { + return {}; + } + std::vector multiplier_values; + absl::c_transform(multiplier_attr, std::back_inserter(multiplier_values), + [](auto v) { return FloatAttr::getValueAsDouble(v); }); + ArrayRef multiplier_array(multiplier_values.data(), + multiplier_values.size()); + + // Multiply the quantization parameters by the multiplier. + QuantizedType new_qtype; + auto element_type = mlir::cast(q_op.getType()).getElementType(); + if (auto uniform_type = llvm::dyn_cast(element_type)) { + if (multiplier_attr.isSplat()) { + double new_scale = multiplier_array.front() * uniform_type.getScale(); + new_qtype = UniformQuantizedType::get( + uniform_type.getFlags(), uniform_type.getStorageType(), + uniform_type.getExpressedType(), new_scale, + uniform_type.getZeroPoint(), uniform_type.getStorageTypeMin(), + uniform_type.getStorageTypeMax()); + } else { + auto new_scales = + MultiplyTwoArrays(multiplier_array, {uniform_type.getScale()}); + int32_t quantized_dim = new_value_type.getRank() - 1; + auto new_zero_points = + SmallVector(new_scales.size(), uniform_type.getZeroPoint()); + new_qtype = quant::UniformQuantizedPerAxisType::get( + uniform_type.getFlags(), uniform_type.getStorageType(), + uniform_type.getExpressedType(), new_scales, new_zero_points, + quantized_dim, uniform_type.getStorageTypeMin(), + uniform_type.getStorageTypeMax()); + } + } else if (auto per_axis_type = + llvm::dyn_cast_or_null( + element_type)) { + auto new_scales = + MultiplyTwoArrays(multiplier_array, per_axis_type.getScales()); + new_qtype = quant::UniformQuantizedPerAxisType::get( + per_axis_type.getFlags(), per_axis_type.getStorageType(), + per_axis_type.getExpressedType(), new_scales, + per_axis_type.getZeroPoints(), per_axis_type.getQuantizedDimension(), + per_axis_type.getStorageTypeMin(), per_axis_type.getStorageTypeMax()); + } + + auto quantize = builder.create( + q_op.getLoc(), new_value_type.clone(new_qtype), new_value); + auto dequantize = builder.create( + dq_op.getLoc(), new_value_type, quantize.getResult()); + return dequantize.getResult(); +} + +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_lifting.inc" + +void PrepareLiftingPass::runOnOperation() { + MLIRContext* ctx = &getContext(); + auto func = getOperation(); + + // The pattern includes decomposing batch normalization ops, fusing add/mul + // with a constant operand to a preceding affine operation. + RewritePatternSet patterns(ctx); + populateWithGenerated(patterns); + patterns.add(ctx); + if (op_set_ != OpSet::XLA) { + // Convert Einsum into BatchMatMul for non-XLA opsets. + // For the uniform opset, it is requested to maintain the BatchMatmul logic. + // For the TF opset, since we need to test the effect we remain it as a + // future work. 
+ patterns.add(ctx); + } + + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + func.emitError() << "tf-quant-prepare-lifting failed."; + signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr> CreatePrepareLiftingPass( + const OpSet target_opset) { + return std::make_unique(target_opset); +} + +static PassRegistration pass; + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_lifting.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_lifting.td new file mode 100644 index 000000000000..78f1b371e907 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_lifting.td @@ -0,0 +1,209 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +include "mlir/IR/OpBase.td" +include "mlir/IR/PatternBase.td" +include "mlir/Dialect/Func/IR/FuncOps.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" + +// Converts arith.constant ops from freezing passes back to tf.Const ops. +def ConvertArithConstToTfConst : Pat< + (Arith_ConstantOp:$res DenseElementsAttr:$value), + (TF_ConstOp $value), + [(AnyStaticShapeTensor $res)]>; + +// Remove CheckNumerics op +def RemoveCheckNumerics : Pat< + (TF_CheckNumericsOp $arg, $msg), + (replaceWithValue $arg)>; + +// Remove StopGradient op +def RemoveStopGradient : Pat< + (TF_StopGradientOp $arg), + (replaceWithValue $arg)>; + +// Converts tf.FusedBatchNormV3 into a sequence of more primitive arithmetic +// operations. Specifically, performs the following calculation: +// +// (x - mean) * scale / sqrt(variance + epsilon) + offset +// +// Let multiplier = scale / sqrt(variance + epsilon), +// to compute +// (x - mean) * scale / sqrt(variance + epsilon) + offset, +// is then to compute +// (x * multiplier) + (offset - mean * multiplier). +// +// TODO(b/228916181): There is a known issue with this DDR rule that it doesn't +// take into account broadcasting conditions. If the issue needs to be handled, +// see tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +def FoldFusedBatchNormV3: Pattern< + (TF_FusedBatchNormV3Op:$root + $x, $scale, $offset, $mean, $variance, + F32Attr:$epsilon, $exponential_avg_factor, + $data_format, IsFalseBoolAttr:$is_training), + [(TF_AddV2Op + (TF_MulOp + $x, + (TF_MulOp:$multiplier + $scale, + (TF_RsqrtOp + (TF_AddV2Op $variance, + (TF_ConstOp $epsilon))))), + (TF_SubOp $offset, (TF_MulOp $mean, $multiplier))), + // We already guaranteed that the last five results have no use so it does + // not matter what value we provide here for replacement. 
+    /*batch_mean=*/(replaceWithValue $x),
+    /*batch_variance=*/(replaceWithValue $x),
+    /*reserve_space_1=*/(replaceWithValue $x),
+    /*reserve_space_2=*/(replaceWithValue $x),
+    /*reserve_space_3=*/(replaceWithValue $x)],
+   [(HasNoUseOf:$root__1), (HasNoUseOf:$root__2),
+    (HasNoUseOf:$root__3), (HasNoUseOf:$root__4),
+    (HasNoUseOf:$root__5)]>;
+
+class HasEqualElementSize<list<int> shape_1, list<int> shape_2> : Constraint<
+  CPred<"HasEqualElementSize($0, $1,"
+        "llvm::ArrayRef<int>({" # !interleave(shape_1, ", ") # "}),"
+        "llvm::ArrayRef<int>({" # !interleave(shape_2, ", ") # "}))">,
+  "Checks if the given dimensions contain the same number of elements.">;
+
+def ReshapableTo1DTensor : Constraint<
+  CPred<"ReshapableTo1DTensor(llvm::cast<ShapedType>($0.getType()))">,
+  "Checks if the value dims are all ones except the rightmost dim">;
+
+def ReshapeTo1DTensor : NativeCodeCall<
+  "ReshapeTo1DTensor($_builder, $_loc, $0)">;
+
+def HasEqualShape : Constraint<CPred<
+  "llvm::cast<ShapedType>($0.getType()).hasRank() && "
+  "llvm::cast<ShapedType>($1.getType()).hasRank() && "
+  "llvm::cast<ShapedType>($0.getType()).getShape() == llvm::cast<ShapedType>($1.getType()).getShape()">,
+  "Checks if the shapes of the tensors are the same.">;
+
+// Make the 1D value $0 broadcastable with the shape of $1.
+def MakeOneDimValueBroadcastable : NativeCodeCall<
+  "MakeOneDimValueBroadcastable($_builder, $_loc, $0, llvm::cast<ShapedType>($1.getType()))">;
+
+// Match convolution op with "NHWC" data format or matmul op.
+def SupportedAffineOpMatcher : NativeCodeCall<
+  "MatchSupportedAffineOp($_self, $0, $1, $2)">;
+
+// Checks if a value can be symmetrically quantized.
+def CanBeSymmetricallyQuantized : Constraint<CPred<"CanBeSymmetricallyQuantized($0)">>;
+
+// Multiplies the value followed by a FakeQuant op and adjusts its params.
+def MultiplyFakeQuantValue : NativeCodeCall<
+  "MultiplyFakeQuantValue($_builder, $_loc, $0...)">;
+
+// Convert AddV2Op following an AffineOp to BiasAddOp.
+// For Conv3D, even though the Conv3D op has "NDHWC" data format, the BiasAdd
+// will still have the data format of "NHWC".
+def ConvertAddToBiasAdd : Pat<
+  (TF_AddV2Op
+    (SupportedAffineOpMatcher $conv_out, $input, $weight),
+    (TF_ConstOp:$add_rhs IsFloatElementsAttr:$add_rhs_value)),
+  (TF_BiasAddOp $conv_out, $add_rhs, (CreateStringAttr<"NHWC">)),
+  [(HasRankOf<1> $add_rhs_value),
+   (HasEqualElementSize<[-1], [0]> $conv_out, $add_rhs)], [], (addBenefit -1)>;
+
+// Convert conv+sub+mul pattern to conv+mul+add.
+// (conv - sub) * mul -> conv * mul + (-sub) * mul
+//
+// This is needed to support Conv+BatchNorm pattern from Jax models converted
+// using jax2tf w/o native serialization. Note that Jax2tf patterns always
+// extend bias shapes to a rank of 4, e.g. 1x1x1x5.
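+// As a quick scalar sanity check (illustrative): with x = 3, sub = 1, mul = 2,
+// (3 - 1) * 2 = 4 and 3 * 2 + (-1) * 2 = 4 as well.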
+def ConvertSubMulToMulAdd : Pat< + (TF_MulOp + (TF_SubOp + (SupportedAffineOpMatcher $conv_out, $input, $weight), + (TF_ConstOp:$sub_rhs IsFloatElementsAttr:$sub_rhs_value)), + (TF_ConstOp:$mul_rhs IsFloatElementsAttr:$mul_rhs_value)), + (TF_AddV2Op + (TF_MulOp $conv_out, (ReshapeTo1DTensor $mul_rhs)), + (TF_MulOp + (TF_NegOp (ReshapeTo1DTensor $sub_rhs)), + (ReshapeTo1DTensor $mul_rhs))), + [(ReshapableTo1DTensor $mul_rhs), + (ReshapableTo1DTensor $sub_rhs), + (HasEqualElementSize<[-1], [-1]> $conv_out, $mul_rhs), + (HasEqualElementSize<[-1], [-1]> $conv_out, $sub_rhs)]>; + +// TODO(b/278493977): Create generic implementation of lifting any fused op +// with any reshaping op +def ConvertAddWithReshapeToBiasAddWithReshape : Pat< + (TF_AddV2Op + (TF_ReshapeOp:$reshape_out + (SupportedAffineOpMatcher $_, $_, $_), + $_ + ), + (TF_ConstOp:$add_rhs IsFloatElementsAttr:$add_rhs_value)), + (TF_BiasAddOp $reshape_out, $add_rhs, (CreateStringAttr<"NHWC">)), + [(HasRankOf<1> $add_rhs_value), + (HasEqualElementSize<[-1], [0]> $reshape_out, $add_rhs)]>; + +// Fuse consecutive BiasAddOp and an AddV2Op. +// We also handle the case where add_rhs has rank 4. +def FuseBiasAndAddV2 : Pat< + (TF_AddV2Op + (TF_BiasAddOp:$bias_add + $conv_out, + (TF_ConstOp:$bias IsFloatElementsAttr:$bias_value), $data_format), + (TF_ConstOp:$add_rhs IsFloatElementsAttr:$add_rhs_value)), + (TF_BiasAddOp + $conv_out, (TF_AddV2Op $bias, (ReshapeTo1DTensor $add_rhs)), $data_format), + [(HasOneUse $bias_add), + (ReshapableTo1DTensor $add_rhs), + (HasEqualElementSize<[-1], [-1]> $bias, $add_rhs)]>; + +// Fuse AffineOp followed by an MulOp patterns. +def FuseAffineOpAndMul : Pat< + (TF_MulOp + (SupportedAffineOpMatcher $conv_out, $input, $weight), + (TF_ConstOp:$mul_rhs IsFloatElementsAttr:$mul_rhs_value)), + (CloneOpWithReplacedOperands + (GetDefiningOp $conv_out), + $input, + (MultiplyFakeQuantValue $weight, + (MakeOneDimValueBroadcastable $mul_rhs, $weight))), + [(HasOneUse $conv_out), + (HasRankOf<1> $mul_rhs_value), + (HasStaticShapeConstraint $weight), + (CanBeSymmetricallyQuantized $weight), + (HasEqualElementSize<[-1], [0]> $conv_out, $mul_rhs)]>; + +// Fuse AffineOp followed by an BiasAddOp and an MulOp patterns. +def FuseAffineOpWithBiasAddAndMul : Pat< + (TF_MulOp + (TF_BiasAddOp:$bias_add + (SupportedAffineOpMatcher $conv_out, $input, $weight), + $bias, $data_format), + (TF_ConstOp:$mul_rhs IsFloatElementsAttr:$mul_rhs_value)), + (TF_BiasAddOp + (CloneOpWithReplacedOperands + (GetDefiningOp $conv_out), + $input, + (MultiplyFakeQuantValue $weight, + (MakeOneDimValueBroadcastable $mul_rhs, $weight))), + (MultiplyFakeQuantValue $bias, $mul_rhs), $data_format), + [(HasOneUse $conv_out), + (HasOneUse $bias_add), + (HasRankOf<1> $mul_rhs_value), + (HasStaticShapeConstraint $weight), + (CanBeSymmetricallyQuantized $weight), + (CanBeSymmetricallyQuantized $bias), + (HasEqualShape $bias, $mul_rhs_value)]>; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_quantize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_quantize.cc new file mode 100644 index 000000000000..c32b6022e992 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_quantize.cc @@ -0,0 +1,442 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Copied and modified from +// //third_party/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +// This transformation pass applies quantization propagation on TF dialect. +#include +#include +#include +#include + +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_driver.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" // IWYU pragma: keep + +//===----------------------------------------------------------------------===// +// The prepare-quantize Pass. +// +namespace mlir { +namespace tf_quant { + +namespace { + +using QuantMethod = tensorflow::quantization::QuantizationMethod::PresetMethod; + +// Applies prepare quantization on the model in TF dialect. This pass runs +// before the quantization pass and propagate the quantization parameters +// across ops. This step is necessary for post-training quantization and also +// making the quantization rule for some operations in the quantization-aware +// training quantization simpler. +class PrepareQuantizePass + : public PassWrapper> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PrepareQuantizePass) + + // Constructor used by the PassRegistration and enforce uint8 quantization. + // This is only used by test. + explicit PrepareQuantizePass() { + quant_specs_.inference_type = tensorflow::DT_QINT8; + } + + // Constructor used by manually creating the pass. 
+ explicit PrepareQuantizePass(const QuantizationSpecs& quant_specs, + QuantMethod quantization_method) + : quant_specs_(quant_specs) { + quant_specs_.inference_type = tensorflow::DT_QINT8; + enable_per_channel_quantization_ = !quant_specs_.disable_per_channel; + enable_post_training_quantize_ = + (quantization_method == tensorflow::quantization::QuantizationMethod:: + METHOD_STATIC_RANGE_INT8); + } + + PrepareQuantizePass(const PrepareQuantizePass& other) { + quant_specs_ = other.quant_specs_; + enable_post_training_quantize_ = other.enable_post_training_quantize_; + enable_per_channel_quantization_ = !quant_specs_.disable_per_channel; + } + + explicit PrepareQuantizePass(const QuantizationSpecs& quant_specs) + : quant_specs_(quant_specs) { + enable_post_training_quantize_ = quant_specs.post_training_quantization; + } + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-prepare-quantize"; + } + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Prepare TF dialect for quantization"; + } + + void runOnOperation() override; + + private: + // Set the quantization parameters of the input nodes. These parameters are + // converted from the user specified input value ranges. The input nodes with + // non-float tensor types will be skipped because they are not quantizable. + // Return true if number of input nodes doesn't equal to that of the input + // ranges. + bool SetInputNodesQuantizationParams(func::FuncOp func); + + // The function might contain more stats ops than required, and it will + // introduce requantize if the calibration stats have conflicts. This method + // tries to remove all the redundant stats ops. + bool RemoveRedundantStats(func::FuncOp func); + + // Verify the quantization specification is expected for quantizing the + // current function. + bool IsLegalQuantSpecs(func::FuncOp func) { + if (func.getName() == quant_specs_.target_func) { + return func.getNumArguments() == quant_specs_.input_ranges.size(); + } + return true; + } + + // Get the min and max values from the quantization specification for the + // current function and argument index. Uses default values if the function + // is specified in the `quantize_allowlist`. + std::pair, std::optional> + GetMinMaxValuesForArgument(llvm::StringRef func_name, int index) { + if (func_name == quant_specs_.target_func) { + return quant_specs_.input_ranges[index]; + } else { + return {0.0, 255.0}; + } + } + + // Apply some sanity check and report some warnings for those who don't follow + // the best quantization practice. This also fixes some simple violations. + void SanityCheckAndAdjustment(func::FuncOp func); + + // Whether the func contains Quantize ops. This is used to determine whether + // to use the quantization parameters from the fixed output range property. + bool ContainsQuantizeOps(func::FuncOp func); + + QuantizationSpecs quant_specs_; + + Option enable_post_training_quantize_{ + *this, "post-training-quantize", llvm::cl::init(false), + llvm::cl::desc("Enable post training quantization. Only used in tests.")}; + + // A local flag is needed for testing conditions in + // prepare_quantize_ptq_per_channel.mlir. 
+ Option enable_per_channel_quantization_{ + *this, "enable-per-channel-quantization", llvm::cl::init(false), + llvm::cl::desc("Whether enable per-channel quantized weights.")}; +}; + +bool PrepareQuantizePass::SetInputNodesQuantizationParams(func::FuncOp func) { + StringRef func_name = func.getName(); + auto has_quantize_op = [&](const Value arg) { + return (arg.hasOneUse() && + llvm::isa(*arg.user_begin())); + }; + + bool need_to_set_input_nodes_quantization_params = false; + for (const BlockArgument arg : func.getArguments()) { + auto shaped = mlir::dyn_cast(arg.getType()); + if (shaped && mlir::isa(shaped.getElementType()) && + !has_quantize_op(arg)) { + need_to_set_input_nodes_quantization_params = true; + break; + } + } + + if (!need_to_set_input_nodes_quantization_params) { + return false; + } + + // If the validation fails, the pass should stop immediately. + if (!IsLegalQuantSpecs(func)) { + return true; + } + + OpBuilder builder(func); + bool is_signed = quant_specs_.IsSignedInferenceType(); + IntegerAttr num_bits = + builder.getI32IntegerAttr(quant_specs_.GetQuantizationTypeWidth()); + BoolAttr narrow_range = builder.getBoolAttr(false); + + auto add_quantize_op = [&](Location loc, mlir::Type input_type, Block* block, + Block::iterator insertion_point, Value arg, + int i) { + if (auto shaped = mlir::dyn_cast(input_type)) { + if (mlir::isa(shaped.getElementType())) { + // If there are existing quantize ops, they are from training and we + // should respect them. + if (has_quantize_op(arg)) { + return; + } + + auto min_max = GetMinMaxValuesForArgument(func_name, i); + // The input min/max or mean/std are not specified, then skip. + if (!min_max.first.has_value() || !min_max.second.has_value()) return; + + TypeAttr params = GetQuantizedTypeAttr( + builder, input_type, builder.getF64FloatAttr(min_max.first.value()), + builder.getF64FloatAttr(min_max.second.value()), + /*quant_dim=*/-1, num_bits, narrow_range, is_signed); + builder.setInsertionPoint(block, insertion_point); + auto q_op = builder.create( + loc, params.getValue(), arg); + auto dq_op = builder.create( + loc, input_type, q_op.getResult()); + arg.replaceAllUsesWith(dq_op.getResult()); + q_op.setOperand(arg); + } + } + }; + + for (int i = 0, e = func.getNumArguments(); i != e; ++i) { + BlockArgument arg = func.getArgument(i); + auto* arg_block = arg.getOwner(); + add_quantize_op(arg.getLoc(), arg.getType(), arg_block, + std::next(arg_block->begin(), i), arg, i); + } + + return false; +} + +bool PrepareQuantizePass::RemoveRedundantStats(func::FuncOp func) { + return mlir::tf_quant::RemoveRedundantStatsOps(func, GetTFOpQuantSpec, + GetTfQuantScaleSpec); +} + +static Value Quantized(Operation* user) { + if (auto q = llvm::dyn_cast_or_null(user)) { + if (auto dq = llvm::dyn_cast_or_null( + *q.getResult().user_begin())) { + return dq.getResult(); + } + } + return {}; +} + +void PrepareQuantizePass::SanityCheckAndAdjustment(func::FuncOp func) { + // If an op output has two users: one of them is a quantize op and another + // one is returned directly, we decide to return the quantized result instead, + // so this op can be quantized. This is only applied on the returned result + // because the error will not be accumulated. 
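+  //
+  // Schematic example (illustrative):
+  //   %q  = QuantizeCastOp(%out)
+  //   %dq = DequantizeCastOp(%q)
+  //   return %out
+  // becomes `return %dq`, so the op producing %out can be quantized.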
+
+  func.walk([&](func::ReturnOp ret) {
+    int i = 0;
+    for (Value returned : ret.getOperands()) {
+      llvm::SmallVector<Value> quantized;
+      for (auto user : returned.getUsers()) {
+        if (auto q = Quantized(user)) {
+          quantized.push_back(q);
+        }
+      }
+      if (quantized.size() == 1) {
+        ret.setOperand(i, quantized.front());
+      }
+      i++;
+    }
+  });
+
+  // Check for (Quant (Dequant $in), $qA) "qdq" pairs that couldn't be
+  // eliminated at this point. This only occurs for the pattern
+  //   (Quant (Dequant (Quant $in, $qB)), $qA)   with $qB != $qA,
+  // where the qdq pair denotes a non-trivial requantization of an already
+  // quantized value. Since this makes little sense (directly quantizing
+  // (Quant $in, $qA) would introduce less quantization noise), the likely
+  // cause is a minor error in constructing the original network model that
+  // introduced back-to-back Fake Quantization operations. Hence: emit a
+  // warning. N.B. at this point we are (temporarily) in the quantization
+  // dialect (presumably to enable re-use in XLA etc.); it is the
+  // mlir::quant::ir::*QuantizeCastOp ops we are matching here.
+  //
+  func.walk([&](mlir::quant::ir::QuantizeCastOp q_op) {
+    // Check whether this quantize op consumes the result of a dequantize op.
+    auto dq_op = dyn_cast_or_null<mlir::quant::ir::DequantizeCastOp>(
+        q_op.getOperand().getDefiningOp());
+    if (!dq_op) {
+      return;
+    }
+    auto dq_arg = dq_op.getOperand();
+
+    if (!dq_arg.hasOneUse()) {
+      // The initial quantization is used someplace else ... so it might be
+      // reasonable for it to be requantized for another purpose.
+      // Ideally we would still want to check whether requantization narrows
+      // rather than widens the representation.
+      return;
+    }
+
+    // Invariant:
+    //   isa<mlir::quant::ir::QuantizeCastOp>(dq_arg.getDefiningOp()) -->
+    //   dq_arg.getType() != q_op.getResult().getType()
+    //
+    // as otherwise the qdq pair would have been optimized away.
+    auto qd_arg_def_q_op = dyn_cast_or_null<mlir::quant::ir::QuantizeCastOp>(
+        dq_arg.getDefiningOp());
+    if (!qd_arg_def_q_op) {
+      return;
+    }
+
+    qd_arg_def_q_op.emitWarning()
+        << " quantizer's output has another quantizer (" << q_op.getLoc()
+        << ") as consumer - intentional?";
+  });
+}
+
+// Merges consecutive QuantizeCast ops. For example, the following case:
+// %1 = tf.QuantizeCastOp(%0) : f32 -> qtype1
+// %2 = tf.QuantizeCastOp(%1) : qtype1 -> qtype2
+// %3 = tf.QuantizedOp1(%1)
+// %4 = tf.QuantizedOp2(%2)
+// will be transformed to:
+// %1 = tf.QuantizeCastOp(%0) : f32 -> qtype1
+// %2 = tf.QuantizeCastOp(%0) : f32 -> qtype2
+// %3 = tf.QuantizedOp1(%1)
+// %4 = tf.QuantizedOp2(%2)
+// Converting from f32 -> qtype1 -> qtype2 will add unexpected quantization
+// loss for %2. This pattern avoids that by converting from f32 -> qtype2
+// directly.
+class MergeConsecutiveQuantizeCast + : public mlir::OpRewritePattern { + public: + explicit MergeConsecutiveQuantizeCast(MLIRContext* context) + : OpRewritePattern(context) {} + + private: + LogicalResult matchAndRewrite(mlir::quant::ir::QuantizeCastOp q_op, + PatternRewriter& rewriter) const override { + auto preceding_qcast = + q_op.getArg().getDefiningOp(); + if (!preceding_qcast) return failure(); + + auto new_qcast = rewriter.create( + q_op.getLoc(), q_op.getType(), preceding_qcast.getArg()); + new_qcast->setAttr(kVolatileOpAttrName, rewriter.getUnitAttr()); + q_op->replaceAllUsesWith(new_qcast); + return success(); + } +}; + +bool PrepareQuantizePass::ContainsQuantizeOps(func::FuncOp func) { + for (const auto& op : func.getOps()) { + if (llvm::isa(op)) return true; + } + return false; +} + +using PrepareQuantStats = + ConvertStatsToQDQs; + +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_quantize.inc" + +void PrepareQuantizePass::runOnOperation() { + func::FuncOp func = getOperation(); + MLIRContext* ctx = func.getContext(); + + quant_specs_.post_training_quantization = enable_post_training_quantize_; + if (quant_specs_.post_training_quantization) { + RemoveRedundantStats(func); + } else { + // Set the quantization parameters for the quantizable input nodes. If this + // failed, return the function immediately. This is only required for + // quantization aware training model conversion. + if (SetInputNodesQuantizationParams(func)) { + return; + } + } + + bool is_signed = quant_specs_.IsSignedInferenceType(); + int bit_width = quant_specs_.GetQuantizationTypeWidth(); + // When this is true, the quantizer will try its best to extract the + // quantization parameters from the op quantization property and constant + // content. This is also set to true when the `quantize_allowlist` and + // `quantize_signed` test flags are enabled. + bool eager_quantize = ContainsQuantizeOps(func); + // Infer the tensor range for the activation ops and weight constants unless + // it is disabled explicitly. + bool infer_tensor_range = + (quant_specs_.post_training_quantization || eager_quantize) && + !quant_specs_.disable_infer_tensor_range; + + // During the legalization, unsigned quantized type is used, so we have to + // convert all of them to signed. + RewritePatternSet patterns(ctx); + populateWithGenerated(patterns); + patterns.add>( + ctx); + // Convert quant stats to int8 quantization parameters. + // Currently, only activation stats are imported, so narrow_range = false. + patterns.add(bit_width, false, true, + /*legacy_float_scale=*/false, ctx); + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + signalPassFailure(); + } + + SanityCheckAndAdjustment(func); + + // Finally, the quantization parameters can be propagated to the rest of the + // values (tensors). + ApplyQuantizationParamsPropagation( + func, is_signed, /*bit_width=*/8, !enable_per_channel_quantization_, + GetTFOpQuantSpec, GetTfQuantScaleSpec, infer_tensor_range, + quant_specs_.legacy_float_scale, /*is_qdq_conversion=*/false); + + RewritePatternSet patterns2(ctx); + patterns2.add(ctx); + if (failed(applyPatternsGreedily(func, std::move(patterns2)))) { + signalPassFailure(); + } +} + +} // namespace + +// Creates an instance of the TensorFlow dialect PrepareQuantize pass. 
+std::unique_ptr> CreatePrepareQuantizePass( + const QuantizationSpecs& quant_specs, QuantMethod quantization_method) { + return std::make_unique(quant_specs, + quantization_method); +} + +static PassRegistration pass; + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_quantize.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_quantize.td new file mode 100644 index 000000000000..4fa7ef333f67 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_quantize.td @@ -0,0 +1,28 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +include "mlir/IR/OpBase.td" +include "mlir/IR/PatternBase.td" +include "mlir/Dialect/Func/IR/FuncOps.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" + +// Converts tf.Const to arith.constant for statically shaped, non-opaque constants. +// Needed for QuantizationDriver to recognize constants. +def ConvertTfConstToArithConst : Pat< + (TF_ConstOp:$res DenseElementsAttr:$value), + (Arith_ConstantOp $value), + [(AnyStaticShapeTensor $res)], [], (addBenefit 10)>; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_quantize_drq.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_quantize_drq.cc new file mode 100644 index 000000000000..df89c3837b77 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_quantize_drq.cc @@ -0,0 +1,313 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Copied and modified from +// //third_party/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc +// This transformation pass applies quantization propagation on TF dialect. 
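+//
+// To exercise this pass in isolation (illustrative; assumes the pass library
+// is linked into an opt-style driver such as tf-opt), the registered argument
+// can be used directly:
+//   tf-opt -tf-quant-prepare-quantize-drq input.mlir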
+ +#include +#include + +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Rewrite/FrozenRewritePatternSet.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_traits.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" + +//===----------------------------------------------------------------------===// +// The prepare-quantize-drq Pass. +// +namespace mlir { +namespace tf_quant { + +namespace { + +using QuantizationUnit = std::pair; +using QuantizationUnits = llvm::SetVector; +using ::tensorflow::quantization::OpSet; + +// Applies prepare quantization on the model in TF dialect for dynamic range +// quantization case. +class PrepareQuantizeDRQPass + : public PassWrapper> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PrepareQuantizeDRQPass) + + // Constructor used by the PassRegistration and enforce int8 quantization. + // This is only used by test. + explicit PrepareQuantizeDRQPass() : op_set_(OpSet::UNIFORM_QUANTIZED) { + quant_specs_.inference_type = tensorflow::DT_QINT8; + } + + // Constructor used by manually creating the pass. + explicit PrepareQuantizeDRQPass(const QuantizationSpecs& quant_specs, + OpSet op_set) + : quant_specs_(quant_specs), op_set_(op_set) { + enable_per_channel_quantization_ = !quant_specs_.disable_per_channel; + } + + PrepareQuantizeDRQPass(const PrepareQuantizeDRQPass& other) { + quant_specs_ = other.quant_specs_; + op_set_ = other.op_set_; + enable_per_channel_quantization_ = !quant_specs_.disable_per_channel; + } + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). 
+ return "tf-quant-prepare-quantize-drq"; + } + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Prepare TF dialect for dynamic range quantization"; + } + + // The function might contain stats ops which are redundant for processing + // dynamic range quantization. And stats ops may cause conflict while + // processing the function for dynamic range quantization. Therefore, this + // method preprocess the function to remove all stats ops. + void removeAllStatsOp(func::FuncOp func); + + void runOnOperation() override; + + private: + QuantizationSpecs quant_specs_; + OpSet op_set_; + + Option enable_per_channel_quantization_{ + *this, "enable-per-channel-quantization", llvm::cl::init(false), + llvm::cl::desc("Whether enable per-channel quantized weights.")}; +}; + +// If the weight is applicable to dynamic range quantization, insert Quantize +// and Dequantize ops with per-tensor scale. +class PrepareDRQQuantizableOp : public OpRewritePattern { + public: + explicit PrepareDRQQuantizableOp(MLIRContext* context, + const QuantizationSpecs& quant_specs, + OpSet op_set, + bool enable_per_channel_quantization) + : OpRewritePattern(context), + quant_specs_(quant_specs), + op_set_(op_set), + enable_per_channel_quantization_(enable_per_channel_quantization) {} + + LogicalResult matchAndRewrite(arith::ConstantOp op, + PatternRewriter& rewriter) const override { + QuantizationUnits quantizable_ops; + + // 1. Collect quantizable ops. + if (!(getQuantizableOps(op, quantizable_ops))) { + return failure(); + } + + // 2. Quantize collected ops. It is immediately quantized by inserting Q-DQ + // pair for int8. + if (!(quantizeOps(rewriter, op, quantizable_ops))) { + return failure(); + } + + return success(); + } + + private: + // Mark users that are applicable for dynamic range quantization where the + // criteria for determining quantizable ops differs by the inference type. + bool getQuantizableOps(arith::ConstantOp op, + QuantizationUnits& quantizable_ops) const { + // Non-float tensors do not need quantization. + auto type = mlir::dyn_cast(op.getType()); + if (!type || !type.getElementType().isF32()) return false; + + Value value = op.getResult(); + + // Check whether dynamic range quantization can be applied. + for (auto& use : value.getUses()) { + Operation* user = use.getOwner(); + int operand_num = use.getOperandNumber(); + std::unique_ptr spec = GetTFOpQuantSpec(user); + + if (quant_specs_.inference_type == tensorflow::DT_QINT8 && + spec->quantizable_operands.contains(operand_num)) { + quantizable_ops.insert({user, operand_num}); + } + } + + return !quantizable_ops.empty(); + } + + // Apply per-tensor quantization for int8 dynamic range quantization. 
+ bool quantizeOpAsInt8(PatternRewriter& rewriter, arith::ConstantOp op, + QuantizationUnit quant_op) const { + auto [quantized_op, weight_idx] = quant_op; + const bool is_narrow_range = true; + const bool is_legacy_float = quant_specs_.legacy_float_scale; + const bool is_signed = quant_specs_.IsSignedInferenceType(); + const int bit_width = quant_specs_.GetQuantizationTypeWidth(); + + std::unique_ptr spec = GetTFOpQuantSpec(quantized_op); + const int quant_dim = spec->coeff_op_quant_dim[weight_idx]; + const bool is_per_channel_quantization = + enable_per_channel_quantization_ && quant_dim != -1; + + QuantizedType quant_type; + DenseFPElementsAttr attr; + if (!matchPattern(op->getResult(0), m_Constant(&attr))) return false; + + if (attr.size() < quant_specs_.minimum_elements_for_weights) { + op->emitRemark("Quantization is skipped for ") + << quantized_op->getName().getStringRef().str() << " because it has " + << mlir::dyn_cast(attr).size() + << " elements which is fewer than the threshold(" + << quant_specs_.minimum_elements_for_weights << " elements)."; + return false; + } + + if (is_per_channel_quantization) { + quant_type = mlir::dyn_cast( + GetUniformQuantizedPerAxisTypeForWeight(attr, quant_dim, + /*symmetric=*/true, bit_width, + is_signed, is_narrow_range, + is_legacy_float)); + } else { + quant_type = + mlir::dyn_cast(GetUniformQuantizedTypeForWeight( + attr, is_narrow_range && is_signed, bit_width, is_signed, + is_narrow_range, is_legacy_float)); + } + return insertQDQ(rewriter, op, quant_type, quant_op); + } + + // Insert Quantize and Dequantize ops. + bool insertQDQ(PatternRewriter& rewriter, arith::ConstantOp op, + QuantizedType quant_type, QuantizationUnit quant_op) const { + if (!quant_type) return false; + + Operation* quantize_op = quant_op.first; + int quantize_operand_num = quant_op.second; + + Type expressed_type = op.getResult().getType(); + Type cast_type = quant_type.castFromExpressedType(expressed_type); + + // Insert DQ-op if it does not exist yet. Otherwise, just rewire without + // creating a new DQ-op. + for (auto connected_op : op->getUsers()) { + auto q_op = + llvm::dyn_cast_or_null(connected_op); + if (q_op && q_op.getType() == cast_type) { + auto dq_op = llvm::cast( + q_op.getResult().use_begin()->getOwner()); + quantize_op->setOperand(quantize_operand_num, dq_op); + return false; + } + } + rewriter.setInsertionPointAfter(op); + auto q = rewriter.create( + op->getLoc(), cast_type, op.getResult()); + auto dq = rewriter.create( + op->getLoc(), expressed_type, q); + quantize_op->setOperand(quantize_operand_num, dq.getResult()); + return true; + } + + // For each filtered user, apply quantization. + bool quantizeOps(PatternRewriter& rewriter, arith::ConstantOp op, + QuantizationUnits& quantizable_ops) const { + bool quantized = false; + + for (auto& quant_op : quantizable_ops) { + if (quant_specs_.inference_type == tensorflow::DT_QINT8) { + quantized |= quantizeOpAsInt8(rewriter, op, quant_op); + } + } + return quantized; + } + + protected: + QuantizationSpecs quant_specs_; + OpSet op_set_; + bool enable_per_channel_quantization_; +}; + +// Remove all the stats ops which are redundant for dynamic range quantization. 
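Editorial note: quantizeOpAsInt8 above delegates the actual parameter derivation to GetUniformQuantizedTypeForWeight. A minimal standalone sketch of the per-tensor branch (symmetric, narrow-range int8, with the minimum-element threshold) is shown below; the struct and function names are illustrative, and the exact scale/rounding policy of the library may differ.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <optional>
#include <vector>

struct PerTensorQuantParams {
  double scale;
  int32_t zero_point;  // 0 for symmetric quantization
};

// Derives symmetric, narrow-range int8 parameters (storage range [-127, 127])
// from the weight values. Returns nullopt when the tensor has fewer elements
// than the threshold, mirroring the "Quantization is skipped" remark above.
std::optional<PerTensorQuantParams> ChooseWeightParams(
    const std::vector<float>& weights, int64_t minimum_elements_for_weights) {
  if (static_cast<int64_t>(weights.size()) < minimum_elements_for_weights)
    return std::nullopt;
  float max_abs = 0.f;
  for (float w : weights) max_abs = std::max(max_abs, std::fabs(w));
  if (max_abs == 0.f) max_abs = 1e-6f;  // avoid a zero scale
  return PerTensorQuantParams{max_abs / 127.0, /*zero_point=*/0};
}
```

The inserted Q-DQ pair then expresses this type at the IR level: the QuantizeCastOp carries the derived quantized type and the DequantizeCastOp restores the expressed float type for the consumer.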
+void PrepareQuantizeDRQPass::removeAllStatsOp(func::FuncOp func) { + func.walk([&](mlir::quant::ir::StatisticsOp stats_op) { + stats_op.replaceAllUsesWith(stats_op.getArg()); + stats_op.erase(); + }); +} + +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_prepare_quantize.inc" + +void PrepareQuantizeDRQPass::runOnOperation() { + MLIRContext* ctx = &getContext(); + RewritePatternSet patterns(ctx); + ModuleOp module_op = getOperation(); + + populateWithGenerated(patterns); + patterns.add(ctx, quant_specs_, op_set_, + enable_per_channel_quantization_); + FrozenRewritePatternSet frozen_patterns(std::move(patterns)); + + for (auto func : module_op.getOps()) { + removeAllStatsOp(func); + if (failed(applyPatternsGreedily(func, frozen_patterns))) { + func.emitError() << "quant-prepare-quantize-drq failed."; + signalPassFailure(); + } + } +} + +} // namespace + +// Creates an instance of the TensorFlow dialect PrepareQuantizeDRQ +// pass. +std::unique_ptr> CreatePrepareQuantizeDRQPass( + const QuantizationSpecs& quant_specs, const OpSet op_set) { + return std::make_unique(quant_specs, op_set); +} + +static PassRegistration pass; + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_preprocess_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_preprocess_op.cc new file mode 100644 index 000000000000..f10d5c64e412 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_preprocess_op.cc @@ -0,0 +1,276 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This transformation pass applies quantization propagation on TF dialect. 
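Editorial note: the runOnOperation bodies in these passes all follow the same shape: build a pattern set, freeze it, and let the greedy driver apply patterns per function until nothing changes. The toy, library-free loop below is only meant to convey that fixed-point behavior; the real applyPatternsGreedily is worklist-driven and far more sophisticated.

```cpp
#include <functional>
#include <vector>

// A "pattern" is anything that rewrites some state and returns true, or
// returns false when it does not apply.
template <typename State>
bool ApplyUntilFixpoint(
    State& state, const std::vector<std::function<bool(State&)>>& patterns,
    int max_iterations = 10) {
  for (int i = 0; i < max_iterations; ++i) {
    bool changed = false;
    for (const auto& pattern : patterns) changed |= pattern(state);
    if (!changed) return true;  // converged
  }
  return false;  // did not converge within the budget
}
```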
+ +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "llvm/Support/CommandLine.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/ValueRange.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Rewrite/FrozenRewritePatternSet.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +//===----------------------------------------------------------------------===// +// The preprocess-op Pass. +// +namespace mlir { +namespace tf_quant { + +namespace { + +using QuantMethod = + ::tensorflow::quantization::QuantizationMethod::PresetMethod; +using QuantizationUnit = std::pair; +using QuantizationUnits = llvm::SetVector; +using ::tensorflow::quantization::OpSet; + +// Preprocesses ops to allow multi-axis quantization, prior to quantization +// passes. Currently, per-channel quantization only supports 1D results. +class PreprocessOpPass + : public PassWrapper> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PreprocessOpPass) + + explicit PreprocessOpPass() = default; + + // Constructor used by manually creating the pass. + explicit PreprocessOpPass(OpSet op_set, const QuantMethod quantization_method, + bool enable_per_channel_quantization) { + op_set_ = op_set; + quantization_method_ = quantization_method; + enable_per_channel_quantization_ = enable_per_channel_quantization; + } + + PreprocessOpPass(const PreprocessOpPass& other) { + op_set_ = other.op_set_; + quantization_method_ = other.quantization_method_; + enable_per_channel_quantization_ = other.enable_per_channel_quantization_; + } + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-preprocess-op"; + } + StringRef getDescription() const final { + // This is a brief description of the pass. 
+ return "Preprocess TF op prior to quantization"; + } + + void runOnOperation() override; + + private: + Option op_set_{ + *this, "target-opset", llvm::cl::init(OpSet::UNIFORM_QUANTIZED), + llvm::cl::desc("Choose target opset."), + llvm::cl::values( + clEnumValN(OpSet::TF, "TF", + "Uses TF ops that mimic quantization behavior"), + clEnumValN(OpSet::XLA, "XLA", "Uses TF XLA ops"), + clEnumValN(OpSet::UNIFORM_QUANTIZED, "UNIFORM_QUANTIZED", + "Uses TF Uniform Quantized ops"))}; + + Option quantization_method_{ + *this, "quantization-method", + llvm::cl::init(tensorflow::quantization::QuantizationMethod:: + METHOD_STATIC_RANGE_INT8), + llvm::cl::desc("Choose quantization method."), + llvm::cl::values( + clEnumValN(tensorflow::quantization::QuantizationMethod:: + METHOD_STATIC_RANGE_INT8, + "ptq", "Post-training static-range quantization"), + clEnumValN(tensorflow::quantization::QuantizationMethod:: + METHOD_DYNAMIC_RANGE_INT8, + "drq", "Post-training dynamic-range quantizaiton"), + clEnumValN(tensorflow::quantization::QuantizationMethod:: + METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8, + "weight_only", "Post-training weight-only quantizaiton"))}; + + Option enable_per_channel_quantization_{ + *this, "enable-per-channel-quantization", llvm::cl::init(false), + llvm::cl::desc("Whether enable per-channel quantized weights.")}; +}; + +// Apply constant transformations for the op_set. +class PreprocessConstantOp : public OpRewritePattern { + public: + explicit PreprocessConstantOp(MLIRContext* context, OpSet op_set, + QuantMethod quantization_method, + bool enable_per_channel_quantization) + : OpRewritePattern(context), + op_set_(op_set), + quantization_method_(quantization_method), + enable_per_channel_quantization_(enable_per_channel_quantization) {} + + LogicalResult addReshapeOpToDepthwiseWeight(TF::PartitionedCallOp op, + PatternRewriter& rewriter, + StringRef function_name) const { + std::unique_ptr spec = GetTFOpQuantSpec(op); + const absl::flat_hash_set operands = spec->quantizable_operands; + + if (operands.size() != 1) return failure(); + int weight_operand_idx = *operands.begin(); + + Operation* weight_op = op.getOperand(weight_operand_idx).getDefiningOp(); + DenseFPElementsAttr attr; + if (!matchPattern(weight_op->getResult(0), m_Constant(&attr))) { + return failure(); + } + + // Get new shape. + llvm::ArrayRef cur_shape = attr.getType().getShape(); + int cur_rank = cur_shape.size(); + if (cur_rank != 4 || cur_shape[2] == 1) return failure(); + TensorType new_shape = RankedTensorType::get( + {cur_shape[0], cur_shape[1], 1, cur_shape[2] * cur_shape[3]}, + attr.getElementType()); + + // Inserts a reshape op. + auto shape_spec_type = + RankedTensorType::get({cur_rank}, rewriter.getIntegerType(64)); + auto new_shape_const_attr = + DenseElementsAttr::get(shape_spec_type, new_shape.getShape()); + rewriter.setInsertionPointAfter(weight_op); + auto new_shape_const = rewriter.create( + weight_op->getLoc(), shape_spec_type, new_shape_const_attr); + auto reshape_op = rewriter.create( + weight_op->getLoc(), new_shape, weight_op->getResult(0), + new_shape_const); + op->setOperand(weight_operand_idx, reshape_op); + + // Create a new function with preprocessed types. 
+ ModuleOp module = op->getParentOfType(); + SymbolTable symbol_table(module); + func::FuncOp float_func = + dyn_cast(symbol_table.lookup(function_name)); + OperandRange func_args = op.getArgs(); + func::FuncOp new_float_func = float_func.clone(); + + SmallVector new_float_func_args{func_args.begin(), func_args.end()}; + new_float_func_args[weight_operand_idx] = reshape_op; + new_float_func.getArgument(weight_operand_idx).setType(new_shape); + new_float_func.setType(FunctionType::get( + getContext(), TypeRange{ValueRange{new_float_func_args}}, + new_float_func.getResultTypes())); + symbol_table.insert(new_float_func); + + op->setAttr("f", SymbolRefAttr::get(rewriter.getContext(), + new_float_func.getName())); + + return success(); + } + + LogicalResult matchAndRewrite(TF::PartitionedCallOp op, + PatternRewriter& rewriter) const override { + const auto f_attr = mlir::dyn_cast(op.getFAttr()); + // Non-quantizable op + if (!op->hasAttr(kQuantTraitAttrName)) return failure(); + StringRef function_name = f_attr.getValue(); + // TODO(b/228928859): Improve the getter function to match attributes rather + // than function name. + if (!function_name.starts_with("composite_")) { + return failure(); + } + + if (function_name.contains("depthwise_conv2d")) { + // Uniform Quantized op requires weights of tf.DepthwiseConv2dNative to + // be transformed from [H,W,C,M] to [H,W,1,CxM] where + // H=height,W=width,C=channel,M=multiplier. Therefore, a reshape op is + // inserted between the constant op and the function op so that the + // constant is safely transformed for the multi-use cases as well. Note + // that bias doesn't need transformation as its shape is already in [CxM]. + if (op_set_ == OpSet::UNIFORM_QUANTIZED || + (op_set_ == OpSet::XLA && enable_per_channel_quantization_ && + quantization_method_ == + tensorflow::quantization::QuantizationMethod:: + METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8)) { + return addReshapeOpToDepthwiseWeight(op, rewriter, function_name); + } + } + return failure(); + } + + private: + const OpSet op_set_; + const QuantMethod quantization_method_; + const bool enable_per_channel_quantization_; +}; + +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.inc" + +void PreprocessOpPass::runOnOperation() { + MLIRContext* ctx = &getContext(); + RewritePatternSet patterns(ctx); + ModuleOp module_op = getOperation(); + + populateWithGenerated(patterns); + patterns.add(ctx, op_set_, quantization_method_, + enable_per_channel_quantization_); + FrozenRewritePatternSet frozen_patterns(std::move(patterns)); + + for (auto func : module_op.getOps()) { + if (failed(applyPatternsGreedily(func, frozen_patterns))) { + func.emitError() << "quant-preprocess-op failed."; + signalPassFailure(); + } + } +} + +} // namespace + +// Creates an instance of the TensorFlow dialect PreprocessOp +// pass. 
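Editorial note: the depthwise-weight handling above relies on the fact that reshaping [H, W, C, M] to [H, W, 1, C*M] moves no data for a row-major buffer; only the shape metadata changes, so per-channel scales can later be taken along the last axis. The self-contained check below verifies that claim; the helper names and sizes are illustrative.

```cpp
#include <cassert>
#include <cstdint>

// Flat offset of element (h, w, c, m) in a row-major [H, W, C, M] buffer.
int64_t OffsetHWCM(int64_t h, int64_t w, int64_t c, int64_t m, int64_t W,
                   int64_t C, int64_t M) {
  return ((h * W + w) * C + c) * M + m;
}

// Flat offset of the same element once viewed as [H, W, 1, C*M], where the
// last-axis index is c * M + m.
int64_t OffsetHW1CM(int64_t h, int64_t w, int64_t c, int64_t m, int64_t W,
                    int64_t C, int64_t M) {
  return ((h * W + w) * 1 + 0) * (C * M) + (c * M + m);
}

int main() {
  const int64_t H = 3, W = 3, C = 4, M = 2;
  for (int64_t h = 0; h < H; ++h)
    for (int64_t w = 0; w < W; ++w)
      for (int64_t c = 0; c < C; ++c)
        for (int64_t m = 0; m < M; ++m)
          assert(OffsetHWCM(h, w, c, m, W, C, M) ==
                 OffsetHW1CM(h, w, c, m, W, C, M));
  return 0;
}
```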
+std::unique_ptr> CreatePreprocessOpPass( + const OpSet op_set, QuantMethod quantization_method, + const bool enable_per_channel_quantization) { + return std::make_unique(op_set, quantization_method, + enable_per_channel_quantization); +} + +static PassRegistration pass; + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_propagate_quantize_type.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_propagate_quantize_type.cc new file mode 100644 index 000000000000..9dbd641391e8 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_propagate_quantize_type.cc @@ -0,0 +1,171 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Rewrite/FrozenRewritePatternSet.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +namespace mlir { +namespace tf_quant { +namespace { + +constexpr StringRef kDequantizeFunctionName = "composite_dequantize"; + +class PropagateQuantizeType + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PropagateQuantizeType) + + // Constructor used by the PassRegistration. This will remove the adaptor ops. + explicit PropagateQuantizeType() = default; + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-propagate-quantize-type"; + } + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Propagate quantized type through allowed ops."; + } + + void runOnOperation() override; +}; + +// Propagate dequantize op if the next op supports the data type. 
+// Given the below graph, +// op_before_dequantize -> dequantize_op -> user_op -> rest_op +// the transformation is applied to result the following graph: +// op_before_dequantize -> user_op -> new_dequantize_op -> rest_op +class PropagateDequantizeOpIfAllowed + : public OpRewritePattern { + public: + explicit PropagateDequantizeOpIfAllowed(MLIRContext* context) + : OpRewritePattern(context) {} + + // Create a new dequantize op that is propagated. + void createNewDequantizeOp(PatternRewriter& rewriter, + TF::PartitionedCallOp original_dequantize_op, + Operation* user_op, int user_idx, + Type new_user_op_type) const { + auto op_before_dequantize = original_dequantize_op.getOperand(0); + + // Create a new dequantize op that is propagated. + rewriter.setInsertionPointAfter(user_op); + TF::PartitionedCallOp new_dequantize_op = + cast(rewriter.clone(*original_dequantize_op)); + + // Skip the original dequant op and connect the op before dequantize to the + // user op. + user_op->setOperand(user_idx, op_before_dequantize); + + // Wire input/output nodes. + new_dequantize_op->setOperand(0, user_op->getResult(0)); + new_dequantize_op->getResult(0).setType(user_op->getResult(0).getType()); + user_op->getResult(0).replaceAllUsesExcept(new_dequantize_op->getResult(0), + new_dequantize_op); + user_op->getResult(0).setType(new_user_op_type); + } + + LogicalResult matchAndRewrite(TF::PartitionedCallOp op, + PatternRewriter& rewriter) const override { + const auto f_attr = mlir::dyn_cast(op.getFAttr()); + StringRef function_name = f_attr.getValue(); + if (!function_name.starts_with(kDequantizeFunctionName)) return failure(); + + llvm::SmallVector users(op->getUsers().begin(), + op->getUsers().end()); + + bool changed = false; + for (auto& use : op->getUses()) { + Operation* user_op = use.getOwner(); + int user_idx = use.getOperandNumber(); + if (!IsOpWithInt8TypeOperand(user_op)) continue; + // If the next op is terminator, function type needs to be changed so + // handle this case separately when propagating for function op is + // added. + if (std::any_of(user_op->getResult(0).getUsers().begin(), + user_op->getResult(0).getUsers().end(), [](Operation* y) { + return y->hasTrait(); + })) + continue; + if (IsOpWithDataMovementTrait(user_op)) { + auto op_before_dequantize = op.getOperand(0); + // New user op type needs to be set since user_op can output integer + // type for the data movement case. + auto original_result_type = user_op->getResult(0).getType(); + auto new_user_op_type = CloneTypeWithNewElementType( + original_result_type, + mlir::cast(op_before_dequantize.getType()) + .getElementType()); + createNewDequantizeOp(rewriter, op, user_op, user_idx, + new_user_op_type); + } else { + createNewDequantizeOp(rewriter, op, user_op, user_idx, + user_op->getResult(0).getType()); + } + changed = true; + } + return changed ? success() : failure(); + } +}; + +void PropagateQuantizeType::runOnOperation() { + RewritePatternSet patterns(&getContext()); + auto module_op = getOperation(); + MLIRContext* ctx = &getContext(); + + patterns.add(ctx); + FrozenRewritePatternSet frozen_patterns(std::move(patterns)); + // Propagation can happen recursively with multiple functions so keep this + // module level. + for (auto func : module_op.getOps()) { + if (failed(applyPatternsGreedily(func, frozen_patterns))) { + func.emitError() << "tf-quant-propagate-quantize-type failed."; + signalPassFailure(); + } + } +} + +} // namespace + +// Creates an instance of the TensorFlow dialect PropagateQuantizeType pass. 
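Editorial note: the rewiring performed by createNewDequantizeOp above (op_before_dequantize -> dequantize -> user becomes op_before_dequantize -> user -> new_dequantize) is sound for users that only move data, because dequantization commutes with pure data movement. The scalar-level sketch below demonstrates that property with ordinary functions standing in for the ops; it is an assumption-level illustration, not the pass logic itself.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Affine dequantization of an int8 buffer.
std::vector<float> Dequantize(const std::vector<int8_t>& q, float scale,
                              int32_t zero_point) {
  std::vector<float> out(q.size());
  for (size_t i = 0; i < q.size(); ++i)
    out[i] = (static_cast<int32_t>(q[i]) - zero_point) * scale;
  return out;
}

// A pure data-movement op: reversing the elements. Works on any element type.
template <typename T>
std::vector<T> Reverse(const std::vector<T>& v) {
  return std::vector<T>(v.rbegin(), v.rend());
}

int main() {
  const std::vector<int8_t> q = {-3, 0, 7, 42};
  const float scale = 0.5f;
  const int32_t zp = 1;
  // dequantize-then-move equals move-then-dequantize, which is why the pass
  // may push the dequantize below such users.
  assert(Reverse(Dequantize(q, scale, zp)) ==
         Dequantize(Reverse(q), scale, zp));
  return 0;
}
```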
+std::unique_ptr<OperationPass<ModuleOp>> CreatePropagateQuantizeTypePass() { + return std::make_unique<PropagateQuantizeType>(); +} + +static PassRegistration<PropagateQuantizeType> pass; + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_opt.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_opt.cc index a403f75403d4..a006c927b40e 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_opt.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_opt.cc @@ -23,6 +23,7 @@ limitations under the License. #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -41,6 +42,7 @@ int main(int argc, char **argv) { mlir::arith::ArithDialect, mlir::tf_type::TFTypeDialect, mlir::quant::QuantDialect, mlir::quantfork::QuantizationForkDialect, + mlir::quant::ir::TFQuantDialect, mlir::tf_executor::TensorFlowExecutorDialect, mlir::stablehlo::StablehloDialect>(); mlir::func::registerAllExtensions(registry); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quantize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quantize.cc new file mode 100644 index 000000000000..54cd8dc3f4b5 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quantize.cc @@ -0,0 +1,585 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_traits.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace mlir { +namespace tf_quant { + +//===----------------------------------------------------------------------===// +// The actual Quantize Pass. +//===----------------------------------------------------------------------===// +namespace { + +using ::tensorflow::quantization::OpSet; + +enum QuantizationTrait { kFullQuantization, kDynamicRangeQuantization }; + +// Base struct for quantization. +template +struct TFQuantizationBase + : public QuantizationPattern { + explicit TFQuantizationBase(MLIRContext* ctx, + const QuantPassSpec& quant_params) + : QuantizationPattern(ctx, quant_params) {} + + // Custom op quantization is not supported. + static bool IsQuantizableCustomOp(Operation* op, + const CustomMap& custom_op_map) { + return false; + } + + // All the quantized ops are supported if the quantization method is dynamic + // range quantization. + static bool AllowDynamicRangeQuantizedOperand( + Operation* quantized_op, const CustomMap& custom_op_map) { + auto call_op = cast(quantized_op); + StringRef function_name = + llvm::cast(call_op.getFAttr()).getValue(); + // The below can be generalized as there are more read-only ops added such + // as slice. + const bool is_gather = function_name.contains("gather"); + return quantization_trait != kFullQuantization || is_gather; + } + + // All the quantized ops are supported if the quantization method is dynamic + // range quantization. 
+ static bool AllowDynamicRangeQuantizedResult(Operation* quantized_op, + const CustomMap& custom_op_map) { + auto call_op = cast(quantized_op); + StringRef function_name = + llvm::cast(call_op.getFAttr()).getValue(); + // The below can be generalized as there are more read-only ops added such + // as slice. + bool is_gather = false; + if (function_name.contains("gather")) is_gather = true; + return quantization_trait != kFullQuantization || + (quantization_trait == kFullQuantization && is_gather); + } + + // If weight_only_quantization is true, the legacy weight-only quantization is + // applied. The legacy weight-only graph has dequantization logic at the + // front. + static bool IsWeightOnlyOp(Operation* quantized_op, + absl::flat_hash_set& ops_blocklist, + bool weight_only_quantization, + const CustomMap& custom_op_map) { + return weight_only_quantization; + } +}; + +// Full integer quantization rewrite pattern using DQ as the root op. +struct TFFullQuantization + : public TFQuantizationBase { + explicit TFFullQuantization(MLIRContext* ctx, + const QuantPassSpec& quant_params) + : TFQuantizationBase( + ctx, quant_params) {} +}; + +// Full integer quantization rewrite pattern using Q as the root op. This is for +// the quantizable ops without floating-point operands. +struct TFFullQuantizationReverse + : public TFQuantizationBase { + explicit TFFullQuantizationReverse(MLIRContext* ctx, + const QuantPassSpec& quant_params) + : TFQuantizationBase(ctx, quant_params) { + } +}; + +// Dynamic range quantization rewrite pattern using DQ as the root op. +struct TFDynamicRangeQuantization + : public TFQuantizationBase { + explicit TFDynamicRangeQuantization( + MLIRContext* ctx, const tf_quant::QuantPassSpec& quant_params) + : TFQuantizationBase(ctx, quant_params) {} +}; + +// Removes quantize-dequantize pairs that are not used in the quantization. +// The benefit of this pattern is set to lower value than other patterns, so +// that the other patterns can work on quantize/dequantize ops first. +class RemoveUnusedQdqPattern + : public OpRewritePattern { + public: + explicit RemoveUnusedQdqPattern(MLIRContext* context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(mlir::quant::ir::DequantizeCastOp dq_op, + PatternRewriter& rewriter) const override { + auto q_op = dq_op.getArg().getDefiningOp(); + if (!q_op) return failure(); + + dq_op.replaceAllUsesWith(q_op.getArg()); + return success(); + } +}; + +class QuantizeSameScaleOpsPattern + : public OpRewritePattern { + public: + explicit QuantizeSameScaleOpsPattern( + MLIRContext* context, OpQuantScaleSpecGetter op_quant_scale_spec_getter, + OpSet target_opset) + // Set the score to a large number so it is always preferred, after + // quantization patterns. + : OpRewritePattern(context, + /*benefit=*/200), + op_quant_scale_spec_getter_(op_quant_scale_spec_getter), + target_opset_(target_opset) {} + + LogicalResult matchAndRewrite(mlir::quant::ir::DequantizeCastOp op, + PatternRewriter& rewriter) const override { + SmallVector quantizing_ops; + auto users = op.getResult().getUsers(); + quantizing_ops.append(users.begin(), users.end()); + + bool changed = false; + // Rewrite the floating-point ops to the quantized version, by fusing + // preceding dequantize ops and succeding quantize ops. + for (Operation* quantizing_op : quantizing_ops) { + // If it is requantize op, we shouldn't rewrite this op. 
+ if (llvm::isa(quantizing_op)) { + return failure(); + } + + // If the op is terminator, not quantizable or any ops from the mlir quant + // ops dialect, we shouldn't rewrite. + if (quantizing_op->hasTrait()) { + return failure(); + } + + if (!op_quant_scale_spec_getter_(quantizing_op) + ->has_same_scale_requirement) { + continue; + } + + if (target_opset_ == OpSet::XLA && + !IsConnectedWithCompsiteFunction(quantizing_op)) { + continue; + } + + // Same scale op is not supported for Uniform Quantized ops. + if (target_opset_ == OpSet::UNIFORM_QUANTIZED) { + continue; + } + + // Collect all the quantized inputs and "clone" the matched op by these + // inputs. + SmallVector inputs; + inputs.reserve(quantizing_op->getNumOperands()); + for (const auto& operand : quantizing_op->getOperands()) { + Type operand_type = operand.getType(); + if (isa(operand_type)) { + inputs.push_back(operand); + continue; + } + + Type elem_type = llvm::cast(operand_type).getElementType(); + if (auto dq_op = dyn_cast_or_null( + operand.getDefiningOp())) { + auto dq_arg_type = llvm::cast(dq_op.getArg().getType()); + auto qtype = llvm::cast(dq_arg_type.getElementType()); + auto scast_op = rewriter.create( + dq_op->getLoc(), dq_arg_type.clone(qtype.getStorageType()), + dq_op.getArg()); + inputs.push_back(scast_op.getResult()); + } else if (!elem_type.isF32()) { + // If the operand is an integer tensor, then it doesn't require the + // DQ op in the pattern. + inputs.push_back(operand); + } else { + return failure(); + } + } + + // Collect all the quantized outputs and replace them by the results of + // the new quantized op. + llvm::SmallDenseMap outputs_replaced; + SmallVector output_types; + output_types.reserve(quantizing_op->getNumResults()); + for (const auto& enumerated_result : + llvm::enumerate(quantizing_op->getResults())) { + Value result = enumerated_result.value(); + Type result_type = result.getType(); + if (isa(result_type)) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result_type); + continue; + } + auto result_tensor_type = llvm::cast(result_type); + // If the user is the Quantize op, it must be the only user. + if (result.hasOneUse() && + llvm::isa(*result.user_begin())) { + auto user = + llvm::cast(*result.user_begin()); + outputs_replaced.insert( + {user.getResult(), enumerated_result.index()}); + auto qtype = llvm::cast( + llvm::cast(user.getType()).getElementType()); + output_types.push_back( + result_tensor_type.clone(qtype.getStorageType())); + } else if (!result_tensor_type.getElementType().isF32()) { + // If the result is an integer tensor, then it doesn't require the + // D op in the pattern. + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result.getType()); + } else { + // TODO(b/224691264): separate matching and rewriting clearly. 
+ return failure(); + } + } + + rewriter.setInsertionPointAfter(quantizing_op); + OperationState new_state(quantizing_op->getLoc(), + quantizing_op->getName().getStringRef(), inputs, + output_types, quantizing_op->getAttrs()); + for (int i = 0; i < quantizing_op->getNumRegions(); ++i) { + new_state.addRegion(); + } + Operation* quantized_op = rewriter.create(new_state); + if (quantizing_op->getNumRegions() != 0) { + for (const auto& indexed_regions : + llvm::enumerate(quantizing_op->getRegions())) { + IRMapping mapping; + indexed_regions.value().cloneInto( + &quantized_op->getRegion(indexed_regions.index()), mapping); + } + } + for (const auto& output_index_pair : outputs_replaced) { + Value output = output_index_pair.getFirst(); + int output_index = output_index_pair.getSecond(); + auto scast_op = rewriter.create( + output.getLoc(), output.getType(), + quantized_op->getResult(output_index)); + output.replaceAllUsesWith(scast_op); + } + changed = true; + } + return success(changed); + } + + private: + // Checks whether the operation is connected with a composite function. + // If not, the same-scale op will not be quantized. This decision is based + // on the current assumption that the performance gain of the same-scale + // op itself could not beat the overhead of the quantize and dequantize + // routines need to be added around that op. When the assumption changes, + // this policy might change as well. + bool IsConnectedWithCompsiteFunction(Operation* same_scale_op) const { + for (const auto& operand : same_scale_op->getOperands()) { + auto dq_op = dyn_cast_or_null( + operand.getDefiningOp()); + if (!dq_op) continue; + + Operation* preceding_op = dq_op.getArg().getDefiningOp(); + if (!preceding_op) continue; + + // Check whether the preceding op is a quantized composite function. + if (llvm::isa(preceding_op)) { + auto call_op = llvm::cast(preceding_op); + if (!IsCompositeFunction(call_op)) continue; + return true; + } + + // Check if the preceding op is a quantized same-scale op. + if (llvm::isa(preceding_op)) { + auto sc_op = llvm::cast(preceding_op); + auto sc_arg_type = llvm::dyn_cast(sc_op.getArg().getType()); + if (sc_arg_type.getElementType().isInteger(8)) { + return true; + } + } + } + + for (const auto& result : same_scale_op->getResults()) { + // If the user is the Quantize op, it must be the only user. + if (!result.hasOneUse() || + !llvm::isa(*result.user_begin())) { + continue; + } + + auto q_op = + llvm::cast(*result.user_begin()); + for (auto following_op : q_op->getUsers()) { + // Check whether the preceding op is a quantized composite function. + if (llvm::isa(following_op)) { + auto call_op = llvm::cast(following_op); + if (!IsCompositeFunction(call_op)) continue; + return true; + } + + // Check if the preceding op is a quantized same-scale op. + if (llvm::isa(following_op)) { + auto sc_op = llvm::cast(following_op); + auto sc_arg_type = + llvm::dyn_cast(sc_op.getResult().getType()); + if (sc_arg_type.getElementType().isInteger(8)) { + return true; + } + } + } + } + + return false; + } + + // Checks if op calls a composite function and all the inputs are quantized. 
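Editorial note: the IsCompositeFunction helper introduced by the comment above (and defined just below) admits a call only when the callee is a "composite_*" function carrying the quantization trait attribute and the call boundary already touches quantized tensors with no f32 tensors remaining. The sketch here models only the type-scan part of that decision; the enum and helper name are made up for illustration.

```cpp
#include <string>
#include <vector>

enum class ElemKind { kF32, kQuantized, kOther };

// Purely illustrative: a call "looks like" a quantized composite when the
// callee name has the composite_ prefix, no boundary tensor is still f32,
// and at least one boundary tensor carries a quantized element type.
bool LooksLikeQuantizedComposite(const std::string& callee,
                                 const std::vector<ElemKind>& boundary_types) {
  if (callee.rfind("composite_", 0) != 0) return false;  // prefix check
  bool has_quantized = false;
  for (ElemKind kind : boundary_types) {
    if (kind == ElemKind::kF32) return false;
    if (kind == ElemKind::kQuantized) has_quantized = true;
  }
  return has_quantized;
}
```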
+ bool IsCompositeFunction(TF::PartitionedCallOp call_op) const { + if (!call_op->hasAttr(kQuantTraitAttrName)) { + return false; + } + + const auto f_attr = llvm::dyn_cast(call_op.getFAttr()); + if (!f_attr || !f_attr.getValue().starts_with("composite_")) { + return false; + } + + bool has_quantized_types = false; + for (Value input : call_op.getArgs()) { + if (auto type = llvm::dyn_cast(input.getType())) { + if (isa(type.getElementType())) { + return false; + } + if (isa(type.getElementType())) { + has_quantized_types = true; + } + } + } + for (Value output : call_op.getOutput()) { + if (auto type = llvm::dyn_cast(output.getType())) { + if (isa(type.getElementType())) { + return false; + } + if (isa(type.getElementType())) { + has_quantized_types = true; + } + } + } + return has_quantized_types; + } + + OpQuantScaleSpecGetter op_quant_scale_spec_getter_; + OpSet target_opset_; +}; + +// The AvgPool op is a same-scale op but it doesn't have int8 kernel, so +// we cast its input to float and its output to int8 as a workaround. +// TODO(b/229183248): Remove this workaround after int8 kernels have been +// added to TF and XLA. +struct QuantizeAvgPoolOpPattern + : public OpRewritePattern { + explicit QuantizeAvgPoolOpPattern(MLIRContext* context) + : OpRewritePattern(context, + /*benefit=*/100) {} + + LogicalResult matchAndRewrite(mlir::quant::ir::StorageCastOp sc_op, + PatternRewriter& rewriter) const override { + auto avg_pool_op = sc_op.getArg().getDefiningOp(); + if (!avg_pool_op) return failure(); + auto preceding_sc_op = dyn_cast_or_null( + avg_pool_op.getValue().getDefiningOp()); + if (!preceding_sc_op) return failure(); + + // Check if the same-scale requirement is met. + auto dq_arg_type = + llvm::cast(preceding_sc_op.getArg().getType()); + auto qtype = llvm::cast(dq_arg_type.getElementType()); + auto q_result_type = llvm::cast(sc_op.getType()); + auto out_qtype = llvm::cast(q_result_type.getElementType()); + if (qtype != out_qtype) { + avg_pool_op.emitError( + "The preceding StorageCastOp and the following " + "StorageCastOp must have the same quantized type"); + return failure(); + } + + // Cast to float type before the AvgPool op. + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPointAfter(preceding_sc_op); + auto fcast_op = rewriter.create( + preceding_sc_op->getLoc(), dq_arg_type.clone(rewriter.getF32Type()), + preceding_sc_op.getResult()); + + // Create a new AvgPool op with float type. + TF::AvgPoolOp float_avg_pool_op = rewriter.create( + avg_pool_op->getLoc(), + avg_pool_op.getType().clone(rewriter.getF32Type()), + /*operands=*/fcast_op.getResult(), + /*attributes=*/avg_pool_op->getAttrs()); + + // Cast back to the storage type after AvgPool op. + auto round_val = rewriter.create( + sc_op.getLoc(), float_avg_pool_op.getOutput()); + auto icast_op = rewriter.create( + sc_op.getLoc(), q_result_type.clone(qtype.getStorageType()), round_val); + avg_pool_op.getResult().replaceAllUsesWith(icast_op.getResult()); + return success(); + } +}; + +// Applies quantization on the model in TF dialect. +class QuantizePass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(QuantizePass) + + // Constructor used by the PassRegistration and only used by test. + explicit QuantizePass() { + quant_specs_.inference_type = tensorflow::DT_QINT8; + } + + // Constructor used by manually creating the pass. 
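Editorial note: QuantizeAvgPoolOpPattern above works around the missing int8 AvgPool kernel by computing the pool in float between two storage casts and rounding the result back, which is valid because the pattern first checks that input and output share the same quantized type. A scalar-level sketch of that sequence follows; the helper name is hypothetical.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Emulates an int8 average pool over one window: cast the stored values to
// float, average, round (TF::RoundOp), and clamp back to the int8 storage
// range. Because input and output share scale/zero_point, no rescaling is
// needed.
int8_t AveragePoolWindowInt8(const std::vector<int8_t>& window) {
  float sum = 0.f;
  for (int8_t v : window) sum += static_cast<float>(v);  // "cast to float"
  const float avg = sum / static_cast<float>(window.size());
  const float rounded = std::round(avg);
  const float clamped = std::clamp(rounded, -128.f, 127.f);
  return static_cast<int8_t>(clamped);  // cast back to the storage type
}
```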
+ explicit QuantizePass(const QuantizationSpecs& quant_specs, + OpSet target_opset) + : quant_specs_(quant_specs) { + weight_quantization_ = quant_specs.weight_quantization; + target_opset_ = target_opset; + } + + QuantizePass(const QuantizePass& other) : quant_specs_(other.quant_specs_) { + weight_quantization_ = other.weight_quantization_; + target_opset_ = other.target_opset_; + } + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-quantize"; + } + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Apply quantization on models in TensorFlow dialect"; + } + + // Determine if the unused Q-DQ pairs need to be removed. For weight-only + // quantizable ops, Q-DQ ops need to be preserved. + bool shouldKeepUnusedQdqPattern(); + + void runOnOperation() override; + + private: + QuantizationSpecs quant_specs_; + + Option weight_quantization_{ + *this, "weight-quantization", llvm::cl::init(false), + llvm::cl::desc("Whether to enable weight quantization.")}; + Option target_opset_{ + *this, "target-opset", llvm::cl::init(OpSet::TF), + llvm::cl::desc("Choose target opset."), + llvm::cl::values( + clEnumValN(OpSet::TF, "TF", + "Uses TF ops that mimic quantization behavior"), + clEnumValN(OpSet::XLA, "XLA", "Uses TF XLA ops"), + clEnumValN(OpSet::UNIFORM_QUANTIZED, "UNIFORM_QUANTIZED", + "Uses TF Uniform Quantized ops"))}; +}; + +bool QuantizePass::shouldKeepUnusedQdqPattern() { + return target_opset_ == OpSet::XLA && + (quant_specs_.weight_only_quantization || + quant_specs_.weight_quantization); +} + +void QuantizePass::runOnOperation() { + RewritePatternSet patterns(&getContext()); + auto func = getOperation(); + auto* ctx = func.getContext(); + + quant_specs_.weight_quantization = weight_quantization_; + const QuantPassSpec quant_params = { + {quant_specs_.verify_numeric, /*error_tolerance=*/5.0f, + quant_specs_.whole_model_verify, /*enable_log_if_failed=*/false}, + quant_specs_}; + + if (quant_specs_.weight_quantization) { + patterns.add(ctx, quant_params); + } else { + patterns.add(ctx, + quant_params); + patterns.add(ctx, GetTfQuantScaleSpec, + target_opset_); + patterns.add(ctx); + } + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + func.emitWarning("Failed to converge pattern at QuantizePass."); + } + + if (!shouldKeepUnusedQdqPattern()) { + RewritePatternSet patterns_2(&getContext()); + patterns_2.add(ctx); + if (failed(applyPatternsGreedily(func, std::move(patterns_2)))) { + signalPassFailure(); + } + } +} +} // namespace + +// Creates an instance of the TensorFlow dialect Quantize pass. +std::unique_ptr> CreateQuantizePass() { + QuantizationSpecs quant_specs; + return std::make_unique(quant_specs, OpSet::TF); +} + +std::unique_ptr> CreateQuantizePass( + QuantizationSpecs quant_specs, OpSet target_opset) { + return std::make_unique(quant_specs, target_opset); +} + +static PassRegistration pass; + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quantize_composite_functions.cc new file mode 100644 index 000000000000..2c5ed6d7fe47 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quantize_composite_functions.cc @@ -0,0 +1,1370 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Verifier.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_traits.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_uniform_attribute_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include 
"tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" +#include "tensorflow/core/ir/importexport/convert_tensor.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using QuantMethod = tensorflow::quantization::QuantizationMethod::PresetMethod; +using ::tensorflow::quantization::OpSet; + +constexpr absl::string_view kQuantizeCompositeFunctionsStepName = + "_quantize_composite_functions"; +constexpr StringRef kQuantizeFuncName = "quantize_i8"; +constexpr StringRef kDequantizeFuncName = "dequantize_i8"; +constexpr StringRef kAttrMapAttribute = "attr_map"; +constexpr StringRef kQuantizedOpsAttribute = "tf_quant.quantized_ops"; +constexpr StringRef kCompositeFuncPrefix = "composite_"; +constexpr StringRef kQuantizedFuncPrefix = "quantized_"; +constexpr StringRef kFloatOutputFuncSuffix = "_float_output_fn"; +constexpr StringRef kHybridFuncSuffix = "_hybrid_fn"; + +class QuantizeCompositeFunctionsPass + : public mlir::PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(QuantizeCompositeFunctionsPass) + + explicit QuantizeCompositeFunctionsPass() = default; + + explicit QuantizeCompositeFunctionsPass( + const QuantMethod quantization_method, const OpSet target_opset, + const bool enable_per_channel_quantization, + const int min_num_elements_for_weights, + const bool enable_legacy_weight_only, + std::optional mlir_dump_file_name) + : enable_legacy_weight_only_(enable_legacy_weight_only), + min_num_elements_for_weights_(min_num_elements_for_weights), + mlir_dump_file_name_(std::move(mlir_dump_file_name)) { + quantization_method_ = quantization_method; + target_opset_ = target_opset; + enable_per_channel_quantization_ = enable_per_channel_quantization; + } + + QuantizeCompositeFunctionsPass(const QuantizeCompositeFunctionsPass& other) { + quantization_method_ = other.quantization_method_; + target_opset_ = other.target_opset_; + enable_per_channel_quantization_ = other.enable_per_channel_quantization_; + min_num_elements_for_weights_ = other.min_num_elements_for_weights_; + enable_legacy_weight_only_ = other.enable_legacy_weight_only_; + mlir_dump_file_name_ = other.mlir_dump_file_name_; + } + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-quantize-composite-functions"; + } + + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Quantize composite functions with QDQ input/outputs."; + } + + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + private: + void runOnOperation() override; + + bool enable_legacy_weight_only_; + int min_num_elements_for_weights_; + std::optional mlir_dump_file_name_; + + // These flags are only used for testing purpose. 
+ Option quantization_method_{ + *this, "quantization-method", + llvm::cl::init(tensorflow::quantization::QuantizationMethod:: + METHOD_STATIC_RANGE_INT8), + llvm::cl::desc("Choose quantization method."), + llvm::cl::values( + clEnumValN(tensorflow::quantization::QuantizationMethod:: + METHOD_STATIC_RANGE_INT8, + "ptq", "Post-training static-range quantization"), + clEnumValN(tensorflow::quantization::QuantizationMethod:: + METHOD_DYNAMIC_RANGE_INT8, + "drq", "Post-training dynamic-range quantizaiton"), + clEnumValN(tensorflow::quantization::QuantizationMethod:: + METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8, + "weight_only", "Post-training weight-only quantization"))}; + + Option target_opset_{ + *this, "target-opset", llvm::cl::init(OpSet::TF), + llvm::cl::desc("Choose target opset."), + llvm::cl::values( + clEnumValN(OpSet::TF, "TF", + "Uses TF ops that mimic quantization behavior"), + clEnumValN(OpSet::XLA, "XLA", "Uses TF XLA ops"), + clEnumValN(OpSet::UNIFORM_QUANTIZED, "UNIFORM_QUANTIZED", + "Uses TF Uniform Quantized ops"))}; + + Option enable_per_channel_quantization_{ + *this, "enable-per-channel-quantization", llvm::cl::init(false), + llvm::cl::desc("Whether enable per-channel quantized weights.")}; +}; + +LogicalResult CreateUniformQuantizedTypeParams(UniformQuantizedType qtype, + Location loc, + PatternRewriter& rewriter, + Value& scale, + Value& zero_point) { + TensorType scale_type = RankedTensorType::get({}, rewriter.getF32Type()); + TensorType zero_point_type = scale_type.clone(rewriter.getI32Type()); + scale = rewriter.create( + loc, scale_type, + DenseFPElementsAttr::get(scale_type, + {static_cast(qtype.getScale())})); + zero_point = rewriter.create( + loc, zero_point_type, + DenseIntElementsAttr::get(zero_point_type, + {static_cast(qtype.getZeroPoint())})); + return success(scale && zero_point); +} + +LogicalResult CreateUniformQuantizedPerAxisTypeParams( + quant::UniformQuantizedPerAxisType qtype, Location loc, + PatternRewriter& rewriter, Value& scale, Value& zero_point) { + // Consuming op should already know about Quantized channel information, + // so not passing it during conversion. This design might change if needed. 
+ ArrayRef scales = qtype.getScales(); + ArrayRef zero_points = qtype.getZeroPoints(); + const int num_channels = scales.size(); + TensorType scale_type = RankedTensorType::get( + {static_cast(num_channels)}, rewriter.getF32Type()); + TensorType zero_point_type = scale_type.clone(rewriter.getI32Type()); + + llvm::SmallVector float_scales; + llvm::SmallVector int32_zero_points; + float_scales.reserve(num_channels); + int32_zero_points.reserve(num_channels); + for (int i = 0; i < num_channels; ++i) { + float_scales.push_back(scales[i]); + int32_zero_points.push_back(zero_points[i]); + } + scale = rewriter.create( + loc, scale_type, DenseFPElementsAttr::get(scale_type, float_scales)); + zero_point = rewriter.create( + loc, zero_point_type, + DenseIntElementsAttr::get(zero_point_type, int32_zero_points)); + return success(scale && zero_point); +} + +LogicalResult CreateQuantizationParams(QuantizedType elem_type, Location loc, + PatternRewriter& rewriter, Value& scale, + Value& zero_point) { + if (!elem_type) { + return failure(); + } + if (auto qtype = mlir::dyn_cast(elem_type)) { + return CreateUniformQuantizedTypeParams(qtype, loc, rewriter, scale, + zero_point); + } else if (auto qtype = mlir::dyn_cast( + elem_type)) { + return CreateUniformQuantizedPerAxisTypeParams(qtype, loc, rewriter, scale, + zero_point); + } + return failure(); +} + +// Converts the element type of the input tensor to the corresponding quantized +// version. Supports only int8 for now and returns nullptr if the input type is +// not supported. +ShapedType ConvertIntToQint(ShapedType input_type, MLIRContext* ctx) { + int bit_width; + bool is_signed; + + Type ele_type = input_type.getElementType(); + if (ele_type.isIntOrFloat()) { + bit_width = ele_type.getIntOrFloatBitWidth(); + is_signed = ele_type.isSignlessIntOrFloat() || ele_type.isSignedInteger(); + } else if (QuantizedType qtype = mlir::dyn_cast(ele_type)) { + bit_width = qtype.getStorageTypeIntegralWidth(); + is_signed = qtype.isSigned(); + } else { + return input_type; + } + + Type new_storage_type; + if (is_signed) { + switch (bit_width) { + case 8: + new_storage_type = TF::Qint8Type::get(ctx); + break; + case 32: + new_storage_type = TF::Qint32Type::get(ctx); + break; + default: + return nullptr; // Not yet supported + } + } else { + return nullptr; // Not yet supported + } + + input_type = input_type.clone(new_storage_type); + return input_type; +} + +// Replaces quant.qcast op to composite quantize_i8 function. 
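Editorial note: CreateUniformQuantizedPerAxisTypeParams above only materializes already-chosen per-channel scales and zero points as 1-D constants. For context, the sketch below shows how such parameters are typically derived, one symmetric scale per slice along the quantization dimension; the channel-major layout and helper names are assumptions for illustration, not the library's routine.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

struct PerChannelParams {
  std::vector<float> scales;         // one per channel
  std::vector<int32_t> zero_points;  // all zero for symmetric quantization
};

// weights[c] holds the float values of channel c. Each channel gets its own
// symmetric, narrow-range int8 scale, analogous in spirit to the per-axis
// weight quantization used above.
PerChannelParams ChoosePerChannelParams(
    const std::vector<std::vector<float>>& weights) {
  PerChannelParams params;
  for (const auto& channel : weights) {
    float max_abs = 1e-6f;  // floor to avoid a zero scale
    for (float w : channel) max_abs = std::max(max_abs, std::fabs(w));
    params.scales.push_back(max_abs / 127.0f);
    params.zero_points.push_back(0);
  }
  return params;
}
```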
+class ReplaceQuantizePattern + : public mlir::OpRewritePattern { + public: + explicit ReplaceQuantizePattern(MLIRContext* context, OpSet target_opset) + : OpRewritePattern(context), + target_opset_(target_opset) {} + + private: + OpSet target_opset_ = OpSet::TF; + + LogicalResult matchAndRewrite(mlir::quant::ir::QuantizeCastOp q_op, + PatternRewriter& rewriter) const override { + auto output_type = mlir::cast(q_op.getType()); + auto elem_type = + mlir::dyn_cast(output_type.getElementType()); + const Location loc = q_op->getLoc(); + Value scale, zero_point; + + if (failed(CreateQuantizationParams(elem_type, loc, rewriter, scale, + zero_point))) { + return failure(); + } + + SmallVector output_types; + + if (target_opset_ == OpSet::UNIFORM_QUANTIZED) { + ShapedType new_output_type = ConvertIntToQint( + mlir::cast(output_type), rewriter.getContext()); + if (!new_output_type) { + q_op->emitError( + "Failed to convert the type to the corresponding qtype."); + return failure(); + } + output_types = {new_output_type}; + } else { + output_types = {output_type.clone(elem_type.getStorageType())}; + } + + SmallVector args = {q_op.getArg(), scale, zero_point}; + FlatSymbolRefAttr func_name = + FlatSymbolRefAttr::get(rewriter.getStringAttr(kQuantizeFuncName)); + + auto quantize_call = rewriter.create( + loc, output_types, args, /*args_attrs=*/nullptr, + /*res_attrs=*/nullptr, func_name, + /*config=*/"", /*config_proto=*/"", /*executor_type=*/""); + auto scast_op = rewriter.create( + loc, output_type, quantize_call->getResult(0)); + q_op->replaceAllUsesWith(scast_op); + return success(); + } +}; + +// Replaces quant.dcast op to composite dequantize_i8 function. +class ReplaceDequantizePattern + : public mlir::OpRewritePattern { + public: + explicit ReplaceDequantizePattern(MLIRContext* context, OpSet target_opset) + : OpRewritePattern(context), + target_opset_(target_opset) {} + + private: + OpSet target_opset_ = OpSet::TF; + + LogicalResult matchAndRewrite(mlir::quant::ir::DequantizeCastOp dq_op, + PatternRewriter& rewriter) const override { + auto input_type = mlir::cast(dq_op.getArg().getType()); + auto elem_type = mlir::dyn_cast(input_type.getElementType()); + const Location loc = dq_op->getLoc(); + + Value scale, zero_point; + if (failed(CreateQuantizationParams(elem_type, loc, rewriter, scale, + zero_point))) { + return failure(); + } + + TensorType output_type = input_type.clone(elem_type.getStorageType()); + if (target_opset_ == OpSet::UNIFORM_QUANTIZED) { + ShapedType new_output_type = ConvertIntToQint( + mlir::cast(output_type), rewriter.getContext()); + if (!new_output_type) { + dq_op->emitError( + "Failed to convert the type to the corresponding qtype."); + return failure(); + } + output_type = mlir::cast(new_output_type); + } + + auto scast_op = rewriter.create( + loc, output_type, dq_op.getArg()); + + FlatSymbolRefAttr func_name = + FlatSymbolRefAttr::get(rewriter.getStringAttr(kDequantizeFuncName)); + SmallVector args = {scast_op->getResult(0), scale, zero_point}; + auto dequantize_call = rewriter.create( + loc, dq_op.getResult().getType(), args, /*args_attrs=*/nullptr, + /*res_attrs=*/nullptr, func_name, + /*config=*/"", /*config_proto=*/"", /*executor_type=*/""); + dq_op->replaceAllUsesWith(dequantize_call); + return success(); + } +}; + +// Checks if input weights are quantized only. 
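+// In other words, a call is a dynamic-range candidate when at least one weight
+// operand is fed by a quant.qcast producing a quantized element type, every
+// operand the op quant spec marks as quantizable is quantized, and none of the
+// results carry a quantized element type.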
+bool IsQuantizedCallforDynamicRange(TF::PartitionedCallOp call_op) { + bool has_quantized_types_for_weights = false; + std::unique_ptr spec = GetTFOpQuantSpec(call_op); + + for (int32_t cur_idx = 0; cur_idx < call_op.getArgs().size(); cur_idx++) { + // Check if the only the weight index has QuantizeCastOp. + auto cur_op = dyn_cast_or_null( + call_op.getArgs()[cur_idx].getDefiningOp()); + if (!cur_op && spec->quantizable_operands.contains(cur_idx)) { + return false; + } else if (cur_op) { + // Check if the QuantizeCastOp has element type of quantized type. + if (!mlir::isa( + getElementTypeOrSelf(cur_op.getResult().getType()))) { + return false; + } + // Satisfies the input condition. + has_quantized_types_for_weights = true; + } + } + for (Value output : call_op.getOutput()) { + if (auto type = mlir::dyn_cast(output.getType())) { + if (mlir::isa(type.getElementType())) { + return false; + } + } + } + return has_quantized_types_for_weights; +} + +// Checks if all the inputs are quantized. +bool IsQuantizedCallforStaticRange(TF::PartitionedCallOp call_op) { + bool has_quantized_types = false; + for (Value input : call_op.getArgs()) { + if (auto type = mlir::dyn_cast(input.getType())) { + if (mlir::isa(type.getElementType())) { + has_quantized_types = true; + } + } + } + for (Value output : call_op.getOutput()) { + if (auto type = mlir::dyn_cast(output.getType())) { + if (mlir::isa(type.getElementType())) { + has_quantized_types = true; + } + } + } + return has_quantized_types; +} + +// Transfers the attributes of the corresponding ops from the float function to +// the quantized function using the attr_map attribute. In the quantized +// function, this map (map1) is in {attr_name_1: attr_identifier} format; and in +// the float function, this map (map2) is in {attr_identifier: attr_name_2} +// format. Where, the attribute identifiers should match between two maps, +// attr_name_1 is the name of the of the attribute needs to be set in the +// quantized function, attr_name_2 is the name of the attribute corresponding to +// the attribute identifier in the float function. +LogicalResult TransferTFAttributesToTFUniformAttributes( + PatternRewriter& rewriter, func::FuncOp float_func, + func::FuncOp quantized_func, QuantMethod quantization_method, + bool enable_per_channel_quantization) { + // A map to find an attribute from its identifier. + llvm::StringMap identifier_to_attr; + + for (Operation& inner_op : float_func.getBody().front().getOperations()) { + if (!inner_op.hasAttr(kAttrMapAttribute)) continue; + // Insert quantization related attribute if they exists. Quantization + // attributes are generated in the prepare pass so the attr_map doesn't + // contain the attribute names. + // TransferQuantizationAttributes(rewriter, inner_op, attrs); + std::string attr_map_str = + inner_op.getAttrOfType(kAttrMapAttribute).str(); + for (absl::string_view element_str : absl::StrSplit(attr_map_str, ',')) { + std::vector key_and_value_pair = + absl::StrSplit(element_str, ':'); + if (key_and_value_pair.size() != 2) { + float_func.emitError("The attr_map attribute is malformed"); + return failure(); + } + identifier_to_attr.insert( + {llvm::StringRef(std::string(key_and_value_pair[1])), + inner_op.getAttr( + llvm::StringRef(std::string(key_and_value_pair[1])))}); + } + } + + // Set the attributes for ops with the attr_map attribute. 
+ for (Operation& inner_op : quantized_func.getBody().front().getOperations()) { + if (auto uniform_op = + llvm::dyn_cast(inner_op); + uniform_op != nullptr) { + if (failed(FillAttributesForUniformQuantizedConvolutionOp( + rewriter, uniform_op, identifier_to_attr, quantization_method, + enable_per_channel_quantization))) + return failure(); + } else if (auto uniform_op = + llvm::dyn_cast(inner_op); + uniform_op != nullptr) { + if (failed(FillAttributesForUniformQuantizedConvolutionOp( + rewriter, uniform_op, identifier_to_attr, quantization_method, + enable_per_channel_quantization))) + return failure(); + } else if (auto uniform_op = + llvm::dyn_cast(inner_op); + uniform_op != nullptr) { + if (failed(FillAttributesForUniformQuantizedDotOp( + rewriter, uniform_op, identifier_to_attr, quantization_method, + enable_per_channel_quantization))) + return failure(); + } else if (auto uniform_op = + llvm::dyn_cast(inner_op); + uniform_op != nullptr) { + if (failed(FillAttributesForUniformQuantizedAddOp( + rewriter, uniform_op, identifier_to_attr, quantization_method, + enable_per_channel_quantization))) + return failure(); + } else if (auto uniform_op = + llvm::dyn_cast(inner_op); + uniform_op != nullptr) { + if (failed(FillAttributesForUniformQuantizedClipByValueOp( + rewriter, uniform_op, identifier_to_attr, quantization_method, + enable_per_channel_quantization))) + return failure(); + } else if (auto uniform_op = + llvm::dyn_cast(inner_op); + uniform_op != nullptr) { + if (failed(FillAttributesForUniformRequantizeOp( + rewriter, uniform_op, identifier_to_attr, quantization_method, + enable_per_channel_quantization))) + return failure(); + } else if (auto uniform_op = + llvm::dyn_cast(inner_op); + uniform_op != nullptr) { + if (failed(FillAttributesForUniformQuantizeOp( + rewriter, uniform_op, identifier_to_attr, quantization_method, + enable_per_channel_quantization))) + return failure(); + } + } + return success(); +} + +// Transfers the attributes of the corresponding ops from the float function to +// the quantized function using the attr_map attribute. In the quantized +// function, this map (map1) is in {attr_name_1: attr_identifier} format; and in +// the float function, this map (map2) is in {attr_identifier: attr_name_2} +// format. Where, the attribute identifiers should match between two maps, +// attr_name_1 is the name of the of the attribute needs to be set in the +// quantized function, attr_name_2 is the name of the attribute corresponding to +// the attribute identifier in the float function. +LogicalResult TransferAttributes(func::FuncOp float_func, + func::FuncOp quantized_func) { + // A map to find an attribute from its identifier. + llvm::StringMap identifier_to_attr; + for (Operation& inner_op : float_func.getBody().front().getOperations()) { + if (!inner_op.hasAttr(kAttrMapAttribute)) continue; + std::string attr_map_str = + inner_op.getAttrOfType(kAttrMapAttribute).str(); + for (absl::string_view element_str : absl::StrSplit(attr_map_str, ',')) { + std::vector key_and_value_pair = + absl::StrSplit(element_str, ':'); + if (key_and_value_pair.size() != 2) { + float_func.emitError("The attr_map attribute is malformed"); + return failure(); + } + identifier_to_attr.insert( + {llvm::StringRef(std::string(key_and_value_pair[0])), + inner_op.getAttr( + llvm::StringRef(std::string(key_and_value_pair[1])))}); + } + } + + // Set the attributes for ops with the attr_map attribute. 
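+  // For illustration only (attribute names and identifiers are hypothetical):
+  // an op in the float composite function may carry
+  //   attr_map = "0:strides,1:padding"
+  // while the matching op in the quantized function carries
+  //   attr_map = "strides:0,padding:1"
+  // so the shared identifiers "0" and "1" link the two attribute names.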
+ for (Operation& inner_op : quantized_func.getBody().front().getOperations()) { + if (!inner_op.hasAttr(kAttrMapAttribute)) continue; + + std::string attr_map_str = + inner_op.getAttrOfType(kAttrMapAttribute).str(); + for (absl::string_view element_str : absl::StrSplit(attr_map_str, ',')) { + std::vector key_and_value_pair = + absl::StrSplit(element_str, ':'); + if (key_and_value_pair.size() != 2) { + float_func.emitError("The attr_map attribute is malformed"); + return failure(); + } + if (identifier_to_attr.count( + llvm::StringRef(std::string(key_and_value_pair[1]))) == 0) { + float_func.emitWarning(absl::StrCat("Using the default value for the '", + key_and_value_pair[0], + "' attribute")); + continue; + } + inner_op.setAttr(llvm::StringRef(std::string(key_and_value_pair[0])), + identifier_to_attr[llvm::StringRef( + std::string(key_and_value_pair[1]))]); + } + inner_op.removeAttr(kAttrMapAttribute); + } + return success(); +} + +// Transfers the location of the main op in float function to ops with +// `attr_map` attributes in quantized function. +LogicalResult TransferLocation(func::FuncOp float_func, + func::FuncOp quantized_func) { + Operation* main_op = nullptr; + for (Operation& inner_op : float_func.getBody().front().getOperations()) { + // Expect only one quantizable op in the composite function. + if (IsOpWithQuantizableTrait(&inner_op)) { + main_op = &inner_op; + break; + } + } + if (!main_op) { + float_func.emitError() << "No quantizable ops found in the function."; + return failure(); + } + + for (Operation& inner_op : quantized_func.getBody().front().getOperations()) { + if (!inner_op.hasAttr(kAttrMapAttribute)) continue; + inner_op.setLoc(main_op->getLoc()); + } + return success(); +} + +// Get the corresponding quantized function name from the given function name. +std::string GetQuantizedFunctionName(StringRef func_name, + const bool merged_with_dequantize, + const bool is_hybrid) { + if (func_name.starts_with(kQuantizedFuncPrefix)) return func_name.str(); + if (!func_name.starts_with(kCompositeFuncPrefix)) return ""; + + auto base_function_name = + llvm::Twine(kQuantizedFuncPrefix) + .concat(llvm::Twine(func_name.substr(kCompositeFuncPrefix.size()) + .rsplit("_fn") + .first)); + + if (merged_with_dequantize) { + return base_function_name.concat("_float_output_fn").str(); + } + + if (is_hybrid) { + return base_function_name.concat("_hybrid_fn").str(); + } + + return base_function_name.concat("_fn").str(); +} + +bool ContainsFloatResultType(ArrayRef result_types) { + for (auto current_type : result_types) { + if (mlir::dyn_cast(current_type).getElementType().isF32()) + return true; + } + return false; +} + +// Unwraps quantization parameters of PartitionedCall ops with quantized +// input/outputs that are created from QuantizePass. 
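+// A rough sketch (illustrative IR; function names are hypothetical):
+//   %out = "tf.PartitionedCall"(%q_input, %q_weight)
+//            {f = @composite_conv2d_fn_1, ...}
+// is rewritten into a call to a cloned quantized function,
+//   %out = "tf.PartitionedCall"(%input_i8, %weight_i8,
+//                               %in_scale, %in_zp, %w_scale, %w_zp)
+//            {f = @quantized_conv2d_fn}
+// with quant.scast ops inserted around the call so the surrounding types are
+// preserved.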
+class QuantizeFunctionPattern + : public mlir::OpRewritePattern { + public: + explicit QuantizeFunctionPattern(MLIRContext* context, + const QuantMethod quantization_method, + const OpSet target_opset, + const bool enable_per_channel_quantization) + : OpRewritePattern(context), + quantization_method_(quantization_method), + target_opset_(target_opset), + enable_per_channel_quantization_(enable_per_channel_quantization) {} + + private: + QuantMethod quantization_method_ = + tensorflow::quantization::QuantizationMethod::METHOD_STATIC_RANGE_INT8; + OpSet target_opset_ = OpSet::TF; + bool enable_per_channel_quantization_; + + LogicalResult matchAndRewrite(TF::PartitionedCallOp call_op, + PatternRewriter& rewriter) const override { + const auto f_attr = mlir::dyn_cast(call_op.getFAttr()); + // removeAttr will return nullptr if no attribute was removed. + if (!call_op->removeAttr(kQuantTraitAttrName) || !f_attr) { + return failure(); + } + if (!f_attr.getValue().starts_with(kCompositeFuncPrefix)) { + return failure(); + } + + bool has_quantized_types = false; + if (quantization_method_ == tensorflow::quantization::QuantizationMethod:: + METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8) { + // Skipping input type check for weight-only quantization as it can be + // dequantized beforehand for the legacy scheme. + has_quantized_types = true; + } else { + // Determines if all required float input/outputs are now quantized. + // Either one of the criteria needs to meet. + has_quantized_types |= IsQuantizedCallforDynamicRange(call_op); + has_quantized_types |= IsQuantizedCallforStaticRange(call_op); + } + + if (!has_quantized_types) return failure(); + + SmallVector args; + SmallVector qparam_args; + for (Value arg : call_op.getArgs()) { + if (const auto arg_type = mlir::dyn_cast(arg.getType())) { + QuantizedType qtype = + mlir::dyn_cast(arg_type.getElementType()); + if (!qtype) continue; + if (!mlir::isa(qtype)) { + return failure(); + } + Value scale, zero_point; + if (failed(CreateQuantizationParams(qtype, arg.getLoc(), rewriter, + scale, zero_point))) { + // As the quantized types are already checked, this is unexpected. + call_op->emitError( + "Failed to create quantization parameter for an argument."); + return failure(); + } + qparam_args.push_back(scale); + qparam_args.push_back(zero_point); + } + } + + for (Value result : call_op->getResults()) { + if (auto result_type = mlir::dyn_cast(result.getType())) { + QuantizedType qtype = + mlir::dyn_cast(result_type.getElementType()); + if (!qtype) continue; + if (!mlir::isa(qtype)) { + return failure(); + } + Value scale, zero_point; + if (failed(CreateQuantizationParams(qtype, result.getLoc(), rewriter, + scale, zero_point))) { + // As the quantized types are already checked, this is unexpected. 
+ call_op->emitError( + "Failed to create quantization parameter for a result."); + return failure(); + } + qparam_args.push_back(scale); + qparam_args.push_back(zero_point); + } + } + + rewriter.setInsertionPoint(call_op); + + for (Value arg : call_op.getArgs()) { + TensorType arg_type = mlir::dyn_cast(arg.getType()); + if (!arg_type) { + args.push_back(arg); + continue; + } + QuantizedType qtype = + mlir::dyn_cast(arg_type.getElementType()); + if (!qtype) { + args.push_back(arg); + continue; + } + + mlir::quant::ir::StorageCastOp scast_op; + if (target_opset_ == OpSet::UNIFORM_QUANTIZED) { + ShapedType new_arg_type = ConvertIntToQint( + mlir::cast(arg_type), rewriter.getContext()); + if (!new_arg_type) { + call_op->emitError( + "Failed to convert the type to the corresponding qtype."); + return failure(); + } + scast_op = rewriter.create( + arg.getLoc(), mlir::cast(new_arg_type), arg); + } else { + scast_op = rewriter.create( + arg.getLoc(), arg_type.clone(qtype.getStorageType()), arg); + } + args.push_back(scast_op.getResult()); + } + args.insert(args.end(), qparam_args.begin(), qparam_args.end()); + // For XLA opset, try to merge quantized functions with following Dequantize + // for optimization. + if (target_opset_ == OpSet::XLA) { + if (failed(mergeDequantizeOpFollowingQuantizedFunction(call_op, args, + rewriter))) { + return failure(); + } + } + if (call_op->use_empty()) return success(); + + DenseMap replace_map; + rewriter.setInsertionPointAfter(call_op); + + SmallVector result_types; + for (Value result : call_op->getResults()) { + TensorType result_type = mlir::dyn_cast(result.getType()); + if (!result_type) { + result_types.push_back(result.getType()); + continue; + } + QuantizedType qtype = + mlir::dyn_cast(result_type.getElementType()); + if (!qtype) { + result_types.push_back(result_type); + continue; + } + if (target_opset_ == OpSet::UNIFORM_QUANTIZED) { + ShapedType new_result_type = ConvertIntToQint( + mlir::cast(result_type), rewriter.getContext()); + result_types.push_back(new_result_type); + } else { + result_types.push_back(result_type.clone(qtype.getStorageType())); + } + auto scast_op = rewriter.create( + call_op.getLoc(), result_type, result); + replace_map.insert(std::make_pair(result, scast_op)); + } + + for (auto replace_pair : replace_map) { + Value result = replace_pair.first; + mlir::quant::ir::StorageCastOp scast_op = replace_pair.second; + result.replaceAllUsesExcept(scast_op, scast_op); + } + + // Make a copy of the quantized function. + auto module = call_op->getParentOfType(); + SymbolTable symbol_table(module); + + mlir::func::FuncOp float_func = + dyn_cast(symbol_table.lookup(f_attr.getValue())); + rewriter.setInsertionPointAfter(float_func); + + // Applies only for hybrid ops in SRQ. 
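+    // Here "hybrid" means the call still produces float results under
+    // static-range quantization, in which case the *_hybrid_fn variant of the
+    // quantized function is looked up below instead of the plain *_fn one.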
+ const bool is_hybrid = + ContainsFloatResultType(result_types) && + (quantization_method_ == tensorflow::quantization::QuantizationMethod:: + METHOD_STATIC_RANGE_INT8); + const std::string quantized_function_name = GetQuantizedFunctionName( + f_attr.getValue(), /*merged_with_dequantize=*/false, + /*is_hybrid=*/is_hybrid); + + const mlir::func::FuncOp quantized_func = dyn_cast_or_null( + symbol_table.lookup(quantized_function_name)); + if (quantized_func == nullptr) { + call_op->emitError("Failed to find the quantized function: " + + quantized_function_name); + return failure(); + } + mlir::func::FuncOp new_quantized_func = + dyn_cast(quantized_func->clone()); + + new_quantized_func.setType( + FunctionType::get(getContext(), TypeRange{ValueRange{args}}, + new_quantized_func.getResultTypes())); + for (auto [partitioned_call_arg, new_quantized_func_arg] : + llvm::zip_equal(args, new_quantized_func.getArguments())) { + new_quantized_func_arg.setType(partitioned_call_arg.getType()); + } + + // Set the location for ops so the op name is preserved. + if (failed(TransferLocation(float_func, new_quantized_func))) { + return failure(); + } + + // Set the attributes for ops with the attr_map attribute. + if (target_opset_ == OpSet::UNIFORM_QUANTIZED) { + if (failed(TransferTFAttributesToTFUniformAttributes( + rewriter, float_func, new_quantized_func, quantization_method_, + enable_per_channel_quantization_))) { + return failure(); + } + } else { + if (failed(TransferAttributes(float_func, new_quantized_func))) { + return failure(); + } + } + + rewriter.setInsertionPoint(call_op); + + const StringAttr new_quant_func_name = + symbol_table.insert(new_quantized_func); + rewriter.replaceOpWithNewOp( + call_op, result_types, args, call_op.getArgAttrsAttr(), + call_op.getResAttrsAttr(), FlatSymbolRefAttr::get(new_quant_func_name)); + + return success(); + } + + // For composite functions followed by Dequantize ops, merges the Dequantize + // op into the functions by creating quantized functions with float output. + LogicalResult mergeDequantizeOpFollowingQuantizedFunction( + TF::PartitionedCallOp call_op, const SmallVector& args, + PatternRewriter& rewriter) const { + bool followed_by_dequantize = false; + for (Operation* user : call_op->getUsers()) { + if (llvm::isa(user)) { + followed_by_dequantize = true; + break; + } + } + if (!followed_by_dequantize) return success(); + + rewriter.setInsertionPointAfter(call_op); + SmallVector result_types; + for (Value result : call_op->getResults()) { + TensorType result_type = mlir::dyn_cast(result.getType()); + if (!result_type) { + result_types.push_back(result.getType()); + continue; + } + QuantizedType qtype = + mlir::dyn_cast(result_type.getElementType()); + if (!qtype) { + result_types.push_back(result_type); + continue; + } + + result_types.push_back(result_type.clone(qtype.getExpressedType())); + } + + // Make a copy of the quantized function. 
+ auto module = call_op->getParentOfType(); + SymbolTable symbol_table(module); + + const auto f_attr = mlir::dyn_cast(call_op.getFAttr()); + const auto float_func = + dyn_cast(symbol_table.lookup(f_attr.getValue())); + rewriter.setInsertionPointAfter(float_func); + + const std::string quantized_function_name = GetQuantizedFunctionName( + f_attr.getValue(), /*merged_with_dequantize=*/true, + /*is_hybrid=*/false); + const auto quantized_func = dyn_cast_or_null( + symbol_table.lookup(quantized_function_name)); + if (quantized_func == nullptr) { + call_op->emitError("Failed to find the quantized function: " + + quantized_function_name); + return failure(); + } + auto new_quantized_func = dyn_cast(quantized_func->clone()); + new_quantized_func.setType( + FunctionType::get(getContext(), TypeRange{ValueRange{args}}, + new_quantized_func.getResultTypes())); + for (auto [partitioned_call_arg, new_quantized_func_arg] : + llvm::zip_first(args, new_quantized_func.getArguments())) { + new_quantized_func_arg.setType(partitioned_call_arg.getType()); + } + + // Set the location for ops so the op name is preserved. + if (failed(TransferLocation(float_func, new_quantized_func))) { + return failure(); + } + + // Set the attributes for ops with the attr_map attribute. + if (target_opset_ == OpSet::UNIFORM_QUANTIZED) { + if (failed(TransferTFAttributesToTFUniformAttributes( + rewriter, float_func, new_quantized_func, quantization_method_, + enable_per_channel_quantization_))) { + return failure(); + } + } else { + if (failed(TransferAttributes(float_func, new_quantized_func))) { + return failure(); + } + } + + rewriter.setInsertionPoint(call_op); + const StringAttr new_quant_func_name = + symbol_table.insert(new_quantized_func); + auto quantized_call_op = rewriter.create( + call_op.getLoc(), result_types, args, call_op.getArgAttrsAttr(), + call_op.getResAttrsAttr(), FlatSymbolRefAttr::get(new_quant_func_name)); + + for (int result_idx : llvm::seq(0, call_op->getNumResults())) { + Value result = call_op->getResult(result_idx); + for (Operation* user : result.getUsers()) { + if (auto dequant_op = + llvm::dyn_cast(user)) { + dequant_op.getResult().replaceAllUsesWith( + quantized_call_op->getResult(result_idx)); + } + } + } + + return success(); + } +}; + +// Converts const -> quant.qcast pattern to quantized constant, after +// quantization parameters are safely included to each quantize composite +// functions. +class QuantizeConstPattern + : public OpRewritePattern { + public: + // This pattern should have larger benefit than ReplaceQuantizePattern + explicit QuantizeConstPattern(MLIRContext* context, OpSet target_opset) + : OpRewritePattern(context, + /*benefit=*/10), + target_opset_(target_opset) {} + + private: + LogicalResult matchAndRewrite(mlir::quant::ir::QuantizeCastOp q_op, + PatternRewriter& rewriter) const override { + DenseFPElementsAttr attr; + if (!matchPattern(q_op.getArg(), m_Constant(&attr))) { + return failure(); + } + + ShapedType tensor_qtype = + mlir::cast(q_op.getResult().getType()); + Attribute tensor_proto_attr = Quantize(attr, tensor_qtype); + if (!tensor_proto_attr) { + return failure(); + } + + Type storage_type = mlir::cast(tensor_qtype.getElementType()) + .getStorageType(); + ShapedType new_type = tensor_qtype.clone(storage_type); + Location loc = q_op.getArg().getLoc(); + + if (target_opset_ == OpSet::UNIFORM_QUANTIZED) { + new_type = ConvertIntToQint(new_type, rewriter.getContext()); + + // TODO(b/225793355): It adds TensorProtoAttr to the constant as a + // workaround. 
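+      // Sketch of what follows (illustrative): the quantized weight is stored
+      // as a "tf.Const" whose value is a TensorProtoAttr wrapping a mangled
+      // TensorProto with dtype DT_QINT8 (or DT_QINT32), since the qint storage
+      // types are not directly representable as a plain DenseElementsAttr.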
+ tensorflow::TensorProto tensor_proto; + if (!mlir::tfg::ConvertToTensorProto( + mlir::cast(tensor_proto_attr), &tensor_proto) + .ok()) { + return failure(); + } + + const int bit_width = + mlir::dyn_cast(tensor_qtype.getElementType()) + .getStorageTypeIntegralWidth(); + + tensor_proto.set_dtype((bit_width == 8) ? tensorflow::DT_QINT8 + : tensorflow::DT_QINT32); + + tensor_proto_attr = ElementsAttr(TF::TensorProtoAttr::get( + new_type, tensorflow::mangling_util::MangleTensor(tensor_proto))); + } + auto const_op = + rewriter.create(loc, new_type, tensor_proto_attr); + // Add scast op to match quantize -> composition pattern. The added scast + // is then removed by canonicalization. ([scast - scast] -> []) + auto scast_op = rewriter.create( + loc, tensor_qtype, const_op.getOutput()); + q_op->replaceAllUsesWith(scast_op); + return success(); + } + + OpSet target_opset_; +}; + +// To calculate per-channel scale and offset, weight of depthwise was reshaped +// to [H, W, 1, InxMul]. After scale and offset has been calculated, this +// pattern gets called and restores the weight of depthwise back +// into [H, W, In, Mul] +class RestoreWeightShapePattern + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + private: + LogicalResult addReshapeOpToDepthwiseWeight(TF::PartitionedCallOp op, + PatternRewriter& rewriter) const { + int weight_operand_idx = 1; + Operation* weight_op = op.getOperand(weight_operand_idx).getDefiningOp(); + + auto weight_type = + mlir::dyn_cast(weight_op->getResult(0).getType()); + auto input_type = mlir::dyn_cast(op.getOperand(0).getType()); + + llvm::ArrayRef weight_shape = weight_type.getShape(); + llvm::ArrayRef input_shape = input_type.getShape(); + + // If weight_shape[2] != 1, it means weight shape was already restored. + if (weight_shape[2] != 1) return failure(); + + // Weight was reshaped into [H, W, 1, InxMul]. + // Since we know in_channels from input_shape, we can derive multiplier. + int64_t in_channels = input_shape[3]; + // If in_channels is 1, there is no need to restore weight shape. + if (in_channels == 1) return failure(); + int64_t multiplier = weight_shape[3] / in_channels; + + TensorType new_shape = RankedTensorType::get( + {weight_shape[0], weight_shape[1], in_channels, multiplier}, + weight_type.getElementType()); + + int cur_rank = weight_type.getRank(); + + // Inserts a reshape op. + auto shape_spec_type = + RankedTensorType::get({cur_rank}, rewriter.getIntegerType(64)); + auto new_shape_const_attr = + DenseElementsAttr::get(shape_spec_type, new_shape.getShape()); + rewriter.setInsertionPointAfter(weight_op); + auto new_shape_const = rewriter.create( + weight_op->getLoc(), shape_spec_type, new_shape_const_attr); + auto reshape_op = rewriter.create( + weight_op->getLoc(), new_shape, weight_op->getResult(0), + new_shape_const); + op->setOperand(weight_operand_idx, reshape_op); + + return success(); + } + + LogicalResult matchAndRewrite(TF::PartitionedCallOp call_op, + PatternRewriter& rewriter) const override { + const auto f_attr = mlir::dyn_cast(call_op.getFAttr()); + StringRef function_name = f_attr.getValue(); + // TODO(b/228928859): Improve the getter function to match attributes rather + // than function name. 
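+    // Illustrative example for addReshapeOpToDepthwiseWeight above: with an
+    // input of shape [1, 224, 224, 2] and a depthwise weight flattened to
+    // [3, 3, 1, 6], the inferred multiplier is 6 / 2 = 3 and the weight is
+    // reshaped back to [3, 3, 2, 3].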
+ // If enable_legacy_weight_only is enabled, QuantizeFunctionsPattern + // does not get called and function remains as composite + if (!function_name.starts_with("quantized_") && + !function_name.starts_with("composite_")) { + return failure(); + } + + if (function_name.contains("depthwise_conv2d")) { + return addReshapeOpToDepthwiseWeight(call_op, rewriter); + } + + return failure(); + } +}; + +// Prints a summary about the quantization results. +class QuantizationSummary { + public: + explicit QuantizationSummary(ModuleOp module) + : module_(module), symbol_table_(module) {} + + void Print() { + llvm::StringMap func_count_map; + int32_t total_quantized_func_count = 0, float_output_func_count = 0, + quantize_func_count = 0, dequantize_func_count = 0, + weight_only_count = 0; + + module_.walk([&](Operation* op) { + if (auto call_op = llvm::dyn_cast_or_null(op)) { + const auto f_attr = + mlir::dyn_cast(call_op.getFAttr()); + if (!f_attr) return; + StringRef func_name = f_attr.getValue(); + if (func_name.starts_with(kQuantizedFuncPrefix)) { + auto representative_name = GetRepresentativeName(func_name); + if (failed(representative_name)) return; + + func_count_map[representative_name.value()].num_quant++; + total_quantized_func_count++; + if (func_name.contains(kFloatOutputFuncSuffix) || + func_name.contains(kHybridFuncSuffix)) { + float_output_func_count++; + } + } else if (func_name.starts_with(kCompositeFuncPrefix)) { + auto representative_name = GetRepresentativeName(func_name); + if (failed(representative_name)) { + // TODO(b/264507511): Print quantization summary for weight-only. + weight_only_count++; + } else { + func_count_map[representative_name.value()].num_float++; + } + } else if (func_name.starts_with("quantize_i")) { + quantize_func_count++; + } else if (func_name.starts_with("dequantize_i")) { + dequantize_func_count++; + } + } else if (auto einsum = llvm::isa(op)) { + if (IsInCompsiteFunction(op)) return; + // Leftover Einsum ops are always non-quantized. + auto op_name = op->getName().stripDialect(); + func_count_map[op_name].num_float++; + } + }); + + // Pad string to a certain size to format the table. Space is preferred to + // Tab since it is easier to check the format in the mlir tests. + auto pad_string = [](StringRef s, int32_t width) -> std::string { + return llvm::Twine(s).concat(std::string(width - s.size(), ' ')).str(); + }; + + // Generate a quantization report. 
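+    // The emitted report is expected to look roughly like this (illustrative
+    // counts):
+    //   -------- Quantization Summary --------
+    //   Number of quantized layers in the model
+    //   --------------------------------
+    //   Name     Count/Total
+    //   ================================
+    //   conv2d   3/4
+    //   matmul   1/1
+    //
+    //   Number of quantized layers with quantized outputs: 3/4
+    //   Number of quantize layers added: 3
+    //   Number of dequantize layers added: 3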
+ size_t name_col_width = 5; + absl::c_for_each(func_count_map.keys(), [&name_col_width](const auto& key) { + name_col_width = std::max(name_col_width, key.size() + 1); + }); + + std::vector lines; + lines.push_back("-------- Quantization Summary --------"); + lines.push_back("Number of quantized layers in the model"); + lines.push_back("--------------------------------"); + lines.push_back( + absl::StrFormat("%s Count/Total", pad_string("Name", name_col_width))); + lines.push_back("================================"); + for (StringRef op_name : func_count_map.keys()) { + const int32_t quantized_count = func_count_map[op_name].num_quant; + const int32_t total_count = + quantized_count + func_count_map[op_name].num_float; + lines.push_back(absl::StrFormat("%s %d/%d", + pad_string(op_name, name_col_width), + quantized_count, total_count)); + } + lines.push_back(""); + lines.push_back(absl::StrFormat( + "Number of quantized layers with quantized outputs: %d/%d", + total_quantized_func_count - float_output_func_count, + total_quantized_func_count)); + lines.push_back(absl::StrFormat("Number of quantize layers added: %d", + quantize_func_count)); + lines.push_back(absl::StrFormat("Number of dequantize layers added: %d", + dequantize_func_count)); + lines.push_back(""); + + // Make the report visible by default. + const std::string log_message = + absl::StrJoin(lines.begin(), lines.end(), /*separator=*/"\n"); + llvm::errs() << log_message; + + // Create a FuncOp and attach the quantization summary to it. This is a + // a hack to check the summary in mlir tests. This function will be + // automatically removed since this pass is always followed by the Symbol + // DCE pass. + OpBuilder builder(module_); + builder.setInsertionPointToEnd(&module_.getBodyRegion().back()); + const auto func_type = + builder.getFunctionType(/*inputs=*/{}, /*results=*/{}); + auto summary_func = builder.create( + builder.getUnknownLoc(), /*sym_name=*/"summary", func_type); + summary_func.setPrivate(); + summary_func->setAttr("quantization_summary", + builder.getStringAttr(log_message)); + } + + private: + // Structs used to count quantized and non-quantized ops. + struct OpCountItem { + int32_t num_quant = 0; + int32_t num_float = 0; + }; + + // Get the representative name attribute value of a composite function. + FailureOr GetRepresentativeName(StringRef func_name) { + std::string quantized_func_name = GetQuantizedFunctionName( + func_name, /*merged_with_dequantize=*/false, /*is_hybrid=*/false); + auto quantized_func = dyn_cast_or_null( + symbol_table_.lookup(quantized_func_name)); + // Quantized function does not exist for weight-only case. + if (!quantized_func || + !quantized_func->hasAttrOfType(kQuantizedOpsAttribute)) { + return failure(); + } + + auto quantized_ops = + quantized_func->getAttrOfType(kQuantizedOpsAttribute) + .getValue(); + if (quantized_ops.empty()) { + quantized_func->emitError() << "At least one op is expected in the " + << kQuantizedOpsAttribute << " attribute."; + return failure(); + } + + // Use the first op as the representative name. 
+ return mlir::cast(quantized_ops.front()).getValue(); + } + + bool IsInCompsiteFunction(Operation* op) { + func::FuncOp parent = op->getParentOfType(); + if (!parent) return false; + + StringRef sym_name = parent.getSymName(); + return sym_name.starts_with(kQuantizedFuncPrefix) || + sym_name.starts_with(kCompositeFuncPrefix); + } + + ModuleOp module_; + SymbolTable symbol_table_; +}; + +static PassRegistration pass; + +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quantize_composite_functions.inc" + +void QuantizeCompositeFunctionsPass::runOnOperation() { + MLIRContext* ctx = &getContext(); + ModuleOp module = getOperation(); + + PassManager pm(ctx); + // Intermediate output from QuantizePass will have PartitionedCall ops with + // quantized input and output types, which are not allowed in TF dialect. + // This can be removed when the composite call supports quantized types. + pm.enableVerifier(false); + + QuantizationSpecs quant_specs; + quant_specs.inference_type = tensorflow::DT_QINT8; + quant_specs.disable_per_channel = !enable_per_channel_quantization_; + + pm.addPass(CreatePreprocessOpPass(target_opset_, quantization_method_, + enable_per_channel_quantization_)); + + // Apply activation-weight quantization. + if (quantization_method_ == + tensorflow::quantization::QuantizationMethod::METHOD_STATIC_RANGE_INT8) { + // For XLA case, weight quantization will be applied for the remaining f32 + // weights even in SRQ. + pm.addNestedPass( + CreatePrepareQuantizePass(quant_specs, quantization_method_)); + pm.addNestedPass( + CreateQuantizePass(quant_specs, target_opset_)); + pm.addNestedPass(CreatePostQuantizePass()); + } else { + // Apply weight quantization. + quant_specs.minimum_elements_for_weights = min_num_elements_for_weights_; + quant_specs.weight_quantization = true; + quant_specs.weight_only_quantization = enable_legacy_weight_only_; + pm.addPass(CreatePrepareQuantizeDRQPass(quant_specs, target_opset_)); + pm.addNestedPass( + CreateQuantizePass(quant_specs, target_opset_)); + pm.addNestedPass(CreatePostQuantizePass()); + } + + absl::Status pm_run_status = tensorflow::quantization::RunPassesOnModuleOp( + mlir_dump_file_name_, pm, module); + if (!pm_run_status.ok()) { + signalPassFailure(); + } + + // Legacy weight-only does not require quantized ops. + if (!enable_legacy_weight_only_) { + RewritePatternSet patterns(ctx); + patterns.add(ctx, quantization_method_, + target_opset_, + enable_per_channel_quantization_); + + if (failed(applyPatternsGreedily(module, std::move(patterns)))) { + signalPassFailure(); + } + } + + // Constant quantization is a lossy transformation, so they are applied only + // after all the other patterns have been applied. 
+ RewritePatternSet patterns_2(ctx); + populateWithGenerated(patterns_2); + patterns_2.add( + ctx, target_opset_); + patterns_2.add(ctx, target_opset_); + + if (target_opset_ == OpSet::XLA && enable_per_channel_quantization_) { + patterns_2.add(ctx); + } + + if (failed(applyPatternsGreedily(module, std::move(patterns_2))) || + failed(verify(module))) { + signalPassFailure(); + } + QuantizationSummary(module).Print(); +} + +} // namespace + +std::unique_ptr> CreateQuantizeCompositeFunctionsPass( + const QuantMethod quantization_method, const OpSet target_opset, + const bool enable_per_channel_quantization, + const int min_num_elements_for_weights, + const bool enable_legacy_weight_only, + std::optional mlir_dump_file_prefix) { + std::optional mlir_dump_file_name; + if (mlir_dump_file_prefix) { + mlir_dump_file_name = absl::StrCat(mlir_dump_file_prefix.value(), + kQuantizeCompositeFunctionsStepName); + } + return std::make_unique( + quantization_method, target_opset, enable_per_channel_quantization, + min_num_elements_for_weights, enable_legacy_weight_only, + mlir_dump_file_name); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quantize_composite_functions.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quantize_composite_functions.td new file mode 100644 index 000000000000..23722a510ac9 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quantize_composite_functions.td @@ -0,0 +1,28 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +include "mlir/IR/OpBase.td" +include "mlir/IR/PatternBase.td" +include "mlir/Dialect/Func/IR/FuncOps.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" + +// Converts reamaining arith.constant ops from quantization passes back to +// tf.Const ops. +def ConvertArithConstToTfConst : Pat< + (Arith_ConstantOp:$res DenseElementsAttr:$value), + (TF_ConstOp $value), + [(AnyStaticShapeTensor $res)], [], (addBenefit 20)>; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quantize_weights.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quantize_weights.cc new file mode 100644 index 000000000000..b9072e05e656 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quantize_weights.cc @@ -0,0 +1,278 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include + +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Verifier.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Rewrite/FrozenRewritePatternSet.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/temp_tf_op_quant_spec.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_tf_quantize_op.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace tf_quant { +namespace { + +class QuantizeWeightsPass + : public mlir::PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(QuantizeWeightsPass) + + explicit QuantizeWeightsPass() : test_mode_(true) { initializeForTest(); } + + explicit QuantizeWeightsPass( + const tensorflow::quantization::QuantizationOptions& quant_options) + : test_mode_(false), quant_options_(quant_options) {} + + QuantizeWeightsPass(const QuantizeWeightsPass& other) { + test_mode_ = other.test_mode_; + quant_options_ = other.quant_options_; + initializeForTest(); + } + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-quantize-weights"; + } + + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Quantize weights used by quantizable ops."; + } + + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + private: + void runOnOperation() override; + + bool test_mode_; + tensorflow::quantization::QuantizationOptions quant_options_; + + // Initialize for tests. 
+ void initializeForTest() { + if (!test_mode_) return; + + tensorflow::quantization::QuantizationComponentSpec quant_spec; + quant_spec.set_quantization_component( + tensorflow::quantization::QuantizationComponentSpec::COMPONENT_WEIGHT); + quant_spec.set_tensor_type( + tensorflow::quantization::QuantizationComponentSpec::TENSORTYPE_INT_8); + auto mutable_quant_method = quant_options_.mutable_quantization_method(); + *mutable_quant_method->add_quantization_component_specs() = quant_spec; + } +}; + +// If a constant is connected to a quantizable op, quantize the constant to have +// the provided data type. +class QuantizeConstWeights : public OpRewritePattern { + public: + explicit QuantizeConstWeights( + MLIRContext* context, + const tensorflow::quantization::QuantizationOptions& quantization_options) + : OpRewritePattern(context), + quant_options_(quantization_options) {} + + LogicalResult matchAndRewrite(TF::ConstOp op, + PatternRewriter& rewriter) const override { + auto weight_component_spec = GetWeightComponentSpec(quant_options_); + if (!weight_component_spec) return failure(); + + // 1. Check if the constant is quantizable. + if (failed((isQuantizableWeight(op)))) { + return failure(); + } + + // 2. Quantize the constant to the provided data type. + // After quantization, the graph will be transformed + // from: + // const -> some op -> quantizable_op + // to: + // q_const -> dequant_op -> some op -> quantizable_op + // + // A dequant_op will propagate to further quantize the next ops in another + // pass. + // + // Note that a constant can be used by multiple ops. For example, if a graph + // looks like below: + // const -> while -> quant_op + // -> not_quant_op + // + // the transformation will be: + // q_const -> dequant_op -> while -> quant_op + // -> not_quant_op + // And the dequant_op op will propagate towards quant_op only. + if (failed(quantizeOps(rewriter, op, weight_component_spec.value()))) { + return failure(); + } + return success(); + } + + private: + // Check if op's user or op's user after an identity op is connected to a + // terminator. + bool checkIfAnyUserIsConnectedToTermiantor(BlockArgument op) const { + for (const auto& user : op.getUsers()) { + if (user->template hasTrait()) return true; + if (auto next_user = dyn_cast_or_null(user)) { + return (*(next_user->getResult(0).getUsers().begin())) + ->template hasTrait(); + } + } + return false; + } + + // Check if the constant op is connected to a quantizable op at some point. + bool hasUsageFromQuantizableOp(TF::ConstOp op) const { + llvm::SmallVector uses_at_current_level{op}; + while (!uses_at_current_level.empty()) { + llvm::SmallVector next_values_to_visit; + for (auto cur_op : uses_at_current_level) { + for (auto& cur_op_use : cur_op.getUses()) { + Operation* next_op = cur_op_use.getOwner(); + int next_op_operand_num = cur_op_use.getOperandNumber(); + if (auto call_op = llvm::dyn_cast(next_op)) { + mlir::func::FuncOp func = + llvm::dyn_cast(call_op.resolveCallable()); + if (!func) continue; + next_values_to_visit.push_back( + func.getArgument(next_op_operand_num)); + } else if (auto while_op = + llvm::dyn_cast_or_null(next_op)) { + func::FuncOp func = while_op.body_function(); + auto func_argument = func.getArgument(next_op_operand_num); + // Check if the op is returned without mutation. Returning values + // from a while op follow return or identity -> return pattern. 
+ if (checkIfAnyUserIsConnectedToTermiantor(func_argument)) + next_values_to_visit.push_back( + func.getArgument(next_op_operand_num)); + } else if (IsOpWithQuantizableTrait(next_op)) { + // Check this before IsOpWithDataMovementTrait since some data + // movement ops are also quantizable ops. + return true; + } else if (IsOpWithDataMovementTrait(next_op)) { + next_values_to_visit.insert(next_values_to_visit.end(), + next_op->getResults().begin(), + next_op->getResults().end()); + } + } + } + uses_at_current_level.swap(next_values_to_visit); + } + return false; + } + + // List of conditions to check if a const op is quantizable. + LogicalResult isQuantizableWeight(TF::ConstOp op) const { + // Non-float tensors do not need quantization. + if (!IsValueWithQuantizablePrecision(op)) return failure(); + // Check if quantizable ops are connected. Do this before num_elements check + // to avoid checking unnecessary constants which causes unintended remarks. + // This check also prevents quantizing unintended consts like scale. + if (!hasUsageFromQuantizableOp(op)) return failure(); + + // Check if the weight size is big enough. + int num_elements_threshold = quant_options_.min_num_elements_for_weights(); + int num_elements = cast(op.getType()).getNumElements(); + if (num_elements < num_elements_threshold) { + op->emitRemark("Quantization is skipped because the op has ") + << num_elements << " elements which is fewer than the threshold(" + << num_elements_threshold << " elements)."; + return failure(); + } + + return success(); + } + + // Apply quantization with the provided spec. + LogicalResult quantizeOps(PatternRewriter& rewriter, TF::ConstOp op, + tensorflow::quantization::QuantizationComponentSpec& + weight_component_spec) const { + if (weight_component_spec.tensor_type() == + tensorflow::quantization::QuantizationComponentSpec::TENSORTYPE_INT_8) { + // TODO - b/296535985: [Converter Component][TF-Quantizer] Factor out + // quant/dequant in QuantizeWeightsPass + auto dequantized_val = + ApplyUniformQuantization(rewriter, op, weight_component_spec); + if (!dequantized_val.has_value()) return failure(); + op.getOutput().replaceAllUsesWith(dequantized_val.value().getResult(0)); + return success(); + } + + op->emitRemark("Not supported quantization data type."); + return failure(); + } + + protected: + tensorflow::quantization::QuantizationOptions quant_options_; +}; + +static PassRegistration pass; + +void QuantizeWeightsPass::runOnOperation() { + MLIRContext* ctx = &getContext(); + auto module_op = getOperation(); + RewritePatternSet patterns(ctx); + + patterns.add(ctx, quant_options_); + + FrozenRewritePatternSet frozen_patterns(std::move(patterns)); + + // Apply transformation on each function. For recursive call case, another + // function can be modified at the same time so avoid running functions in + // parallel. 
+ for (auto func : module_op.getOps()) { + if (failed(applyPatternsGreedily(func, frozen_patterns))) { + func.emitError() << "tf-quant-quantize-weights failed."; + signalPassFailure(); + } + } +} + +} // namespace + +std::unique_ptr> CreateQuantizeWeightsPass( + const tensorflow::quantization::QuantizationOptions& quant_options) { + return std::make_unique(quant_options); +} + +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_remove_var_init_by_const.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_remove_var_init_by_const.cc new file mode 100644 index 000000000000..9067801d8feb --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_remove_var_init_by_const.cc @@ -0,0 +1,122 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include + +#include "absl/log/log.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" + +namespace mlir { +namespace tf_quant { +namespace { + +using ::mlir::tf_saved_model::GetInitializerFunction; +using ::mlir::tf_saved_model::kTfSavedModelInitializerRestoreType; + +// A pass that removes `tf.AssignVariableOp(tf.VarHandleOp, tf.Const)` patterns +// from the initializer function (type = "restore_op"). +// +// Note: initializing values (`tf.Const`s) will be removed and this may result +// in an information loss and uninitialized variable errors. Make sure that this +// effect is desired (e.g. there is a `tf.RestoreV2Op` restoring the variables +// instead). +class RemoveVariableInitializationByConstPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + RemoveVariableInitializationByConstPass) + + StringRef getArgument() const final { + return "tf-quant-remove-var-init-by-const"; + } + + StringRef getDescription() const final { + return "Removes `tf.AssignVariableOp(tf.VarHandleOp, tf.Const)` patterns " + "from the initializer function of type 'restore_op'."; + } + + void runOnOperation() override; +}; + +// Finds and removes the `tf.AssignVariableOp(tf.VarHandleOp, tf.Const)` +// pattern. `tf.VarHandleOp` and `tf.Const` are removed unless they are used by +// other ops. 
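+// For example (illustrative IR):
+//   %handle = "tf.VarHandleOp"() {shared_name = "w"}
+//               : () -> tensor<!tf_type.resource<tensor<2xf32>>>
+//   %init = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>}
+//               : () -> tensor<2xf32>
+//   "tf.AssignVariableOp"(%handle, %init)
+//       : (tensor<!tf_type.resource<tensor<2xf32>>>, tensor<2xf32>) -> ()
+// Only the AssignVariableOp is erased by the pattern; the now-dead VarHandleOp
+// and Const are cleaned up by the greedy driver's dead code elimination.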
+struct RemoveVariableAssignmentByConst + : public OpRewritePattern { + // Inherit the constructors. + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::AssignVariableOp assign_op, + PatternRewriter& rewriter) const override { + Value resource_operand = assign_op.getOperand(0); + Value assigned_value_operand = assign_op.getOperand(1); + + if (!isa(resource_operand.getDefiningOp()) || + !isa(assigned_value_operand.getDefiningOp())) { + return failure(); + } + + // `TF::ConstOp` and `TF::VarHandleOp` are not manually erased. + // `applyPatternsGreedily` performs dead code elimination and unsed + // ops will be erased during the optimization. + rewriter.eraseOp(assign_op); + return success(); + } +}; + +void RemoveVariableInitializationByConstPass::runOnOperation() { + MLIRContext& ctx = getContext(); + + RewritePatternSet patterns(&ctx); + patterns.add(&ctx); + + ModuleOp module_op = getOperation(); + func::FuncOp init_func_op = GetInitializerFunction( + module_op, /*initializer_type=*/kTfSavedModelInitializerRestoreType); + if (init_func_op) { + if (failed(applyPatternsGreedily(init_func_op, std::move(patterns)))) { + init_func_op->emitError( + "Failed to remove variable assignment by const patterns."); + signalPassFailure(); + } + } else { + LOG(INFO) << "Initializer function with type 'restore_op' does not exist. " + "'RemoveVariableInitializationByConstPass' is a no-op."; + } +} + +static PassRegistration pass{}; + +} // namespace + +std::unique_ptr> +CreateRemoveVariableInitializationByConstPass() { + return std::make_unique(); +} +} // namespace tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_replace_cast_hacks_with_tf_xla_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_replace_cast_hacks_with_tf_xla_ops.cc new file mode 100644 index 000000000000..80f2cce9cdd3 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_replace_cast_hacks_with_tf_xla_ops.cc @@ -0,0 +1,1175 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/ValueRange.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_xla_attribute_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "xla/xla_data.pb.h" + +namespace mlir::tf_quant { +namespace { + +constexpr StringRef kTfQuantCreatedEinsum = "__tf_quant_created_einsum"; + +// Replaces mixed-type Conv and Matmul cast hacks with TF XLA ops. +// TODO(b/228403741): Support conversion for dynamic-shaped TF ops. +class ReplaceCastHacksWithTFXLAOpsPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ReplaceCastHacksWithTFXLAOpsPass) + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "tf-quant-replace-cast-hacks-with-tf-xla-ops"; + } + StringRef getDescription() const final { + // This is a brief description of the pass. + return "Replace mixed-type Conv and Matmul cast hacks with TF XLA ops."; + } + + void runOnOperation() override; +}; + +// Generates params for the XLA Convolution op. 
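+// For a 2-D convolution (num_dims = 4) with strides = [1, 2, 2, 1] and
+// dilations = [1, 1, 1, 1], this is expected to produce, roughly:
+//   window_strides      = [2, 2]  (spatial dimensions only)
+//   lhs_dilation        = [1, 1]
+//   rhs_dilation        = [1, 1]
+//   feature_group_count = scalar i32 constant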
+void PrepareXlaConvParams(OpBuilder &builder, Location loc, ArrayAttr strides, + ArrayAttr dilations, int feature_group_cnt, + Value &window_strides, Value &lhs_dilation, + Value &rhs_dilation, Value &feature_group_count, + int num_dims) { + SmallVector lhs_dilation_values(num_dims - 2, 1); + SmallVector stride_values, rhs_dilation_values; + for (int64_t i : llvm::seq(1, num_dims - 1)) { + stride_values.push_back(mlir::cast(strides[i]).getInt()); + rhs_dilation_values.push_back( + mlir::cast(dilations[i]).getInt()); + } + window_strides = Create1DConstValue(builder, loc, stride_values); + lhs_dilation = Create1DConstValue(builder, loc, lhs_dilation_values); + rhs_dilation = Create1DConstValue(builder, loc, rhs_dilation_values); + + feature_group_count = + CreateScalarConstValue(builder, loc, feature_group_cnt); +} + +// Calculates other_tensor_zp * tensor for zero point offset calculation. +Value CreateZeroPointPartialOffset(OpBuilder &builder, Location loc, + Value tensor, int8_t other_tensor_zp, + const ArrayRef output_dims) { + if (other_tensor_zp == 0) { + return CreateScalarConstValue(builder, loc, 0); + } + + auto shape = mlir::cast(tensor.getType()); + SmallVector non_output_indices; + for (int64_t i : llvm::seq(0, shape.getRank())) { + if (absl::c_count(output_dims, i) == 0) { + non_output_indices.push_back(i); + } + } + + auto reduction_indices_value = + Create1DConstValue(builder, loc, non_output_indices); + auto zp = CreateScalarConstValue(builder, loc, other_tensor_zp); + + TensorType tensor_type = mlir::dyn_cast(tensor.getType()); + Value tensor_i32 = builder.create( + loc, tensor_type.clone(builder.getIntegerType(32)), tensor); + auto reduced = + builder.create(loc, tensor_i32, reduction_indices_value, + /*keep_dims=*/builder.getBoolAttr(true)); + auto mul_op = builder.create(loc, zp, reduced); + + SmallVector folded_results = ConstantFoldOpIfPossible(mul_op); + return folded_results.front(); +} + +// Add two contributions, and a zeropoint modification term +// Consider two quantized matrices P, Q with zero points z, w. Let's say the +// dimensions are l X n, n X m. +// What we want to calculate is: R = matmul(P-z, Q-w). +// Then r_ij = sigma(k) (p_ik - z) * (q_kj - w) +// = sigma(k)(p_ik * q_kj) - w * sigma(k)p_ik - z * sigma(k)q_kj +// + sigma(k)z*w. +// zp_input_contribution = z * sigma(k)q_kj +// zp_weight_contribution = w * sigma(k)p_ik +// In case z != 0 and w != 0, we need to additionally calculate sigma(k)z*w, +// which is: # of reduced dim(n in this case) * input_zp * weight_zp +Value MergeZeroPointOffset(OpBuilder &builder, Location loc, Value weight, + const ArrayRef weight_output_dims, + int8_t input_zp, int8_t weight_zp, + Value zp_input_contribution, + Value zp_weight_contribution) { + auto weight_shape = mlir::cast(weight.getType()); + SmallVector weight_non_output_indices; + for (auto i : llvm::seq(0, weight_shape.getRank())) { + if (absl::c_count(weight_output_dims, i) == 0) { + weight_non_output_indices.push_back(i); + } + } + + int32_t static_dim_total = 1; + Value accum_dynamic_dim = nullptr; + SmallVector weight_non_output_dynamic_indices; + for (const int64_t weight_idx : weight_non_output_indices) { + if (weight_shape.isDynamicDim(weight_idx)) { + weight_non_output_dynamic_indices.push_back(weight_idx); + } else { + static_dim_total *= weight_shape.getDimSize(weight_idx); + } + } + + if (!weight_non_output_dynamic_indices.empty()) { + // Has dynamic shapes. 
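// Illustrative example (hypothetical shape, not taken from the code above):
// for a weight of type tensor<?x3x128xi8> with weight_output_dims = {2},
// static_dim_total becomes 3, and the dynamic dimension 0 is read back at run
// time below via tf.Shape + tf.StridedSlice and folded into the offset with a
// tf.Mul.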
+ auto weight_shape_op = builder.create( + loc, weight, /*use32Bit=*/builder.getBoolAttr(false)); + + auto slice_output_type = RankedTensorType::get({1}, builder.getI64Type()); + auto slice_stride = CreateConstValue(builder, loc, {1}, {1}); + for (int64_t weight_idx : weight_non_output_dynamic_indices) { + auto start = CreateConstValue(builder, loc, {1}, {weight_idx}); + auto end = CreateConstValue(builder, loc, {1}, {weight_idx + 1}); + auto sliced_shape_op = builder.create( + loc, slice_output_type, weight_shape_op, start, end, slice_stride); + if (accum_dynamic_dim == nullptr) { + accum_dynamic_dim = sliced_shape_op->getResults().front(); + } else { + accum_dynamic_dim = + builder.create(loc, accum_dynamic_dim, sliced_shape_op) + ->getResults() + .front(); + } + } + } + + const int32_t zp_constant_offset = static_cast(input_zp) * + static_cast(weight_zp) * + static_dim_total; + auto zp_offset_value = + CreateScalarConstValue(builder, loc, zp_constant_offset); + if (accum_dynamic_dim != nullptr) { + accum_dynamic_dim = + builder + .create( + loc, RankedTensorType::get({1}, builder.getI32Type()), + accum_dynamic_dim) + ->getResults() + .front(); + auto mul_op = + builder.create(loc, accum_dynamic_dim, zp_offset_value); + zp_offset_value = mul_op->getResults().front(); + } + + auto offset_sum = builder.create(loc, zp_input_contribution, + zp_weight_contribution); + auto offset_op = builder.create(loc, offset_sum, zp_offset_value); + + SmallVector folded_results = ConstantFoldOpIfPossible(offset_op); + return folded_results.front(); +} + +// Calculates zero-point offset by reducing the weight and multiply it with zp. +// Originally, we have: +// output = (int8_input - input_zp) * (int8_weight - weight_zp) +// So, offset = input_zp * int8_weight + weight_zp * int8_input +// - input_zp * weight_zp. +// This function calculates the `offset` value mentioned above. Note that the +// `output_dims` is the weight dimensions that are not contracted, so they +// appear in the output shape. +Value CalculateZeroPointOffset(OpBuilder &builder, Location loc, Value input, + Value weight, int8_t input_zp, int8_t weight_zp, + const ArrayRef input_output_dims, + const ArrayRef weight_output_dims) { + Value zp_input_contribution = CreateZeroPointPartialOffset( + builder, loc, input, weight_zp, input_output_dims); + Value zp_weight_contribution = CreateZeroPointPartialOffset( + builder, loc, weight, input_zp, weight_output_dims); + + if (input_zp != 0 && weight_zp != 0) { + return MergeZeroPointOffset(builder, loc, weight, weight_output_dims, + input_zp, weight_zp, zp_input_contribution, + zp_weight_contribution); + } + + if (input_zp != 0) return zp_weight_contribution; + return zp_input_contribution; +} + +// Copy the value of d1 into d2. +void CopyXlaDotDimensionNumbers(const xla::DotDimensionNumbers &d1, + xla::DotDimensionNumbers &d2, + const bool copy_left = true) { + if (copy_left) { + for (auto v : d1.lhs_batch_dimensions()) { + d2.add_lhs_batch_dimensions(v); + } + for (auto v : d1.lhs_contracting_dimensions()) { + d2.add_lhs_contracting_dimensions(v); + } + } else { + for (auto v : d1.rhs_batch_dimensions()) { + d2.add_rhs_batch_dimensions(v); + } + for (auto v : d1.rhs_contracting_dimensions()) { + d2.add_rhs_contracting_dimensions(v); + } + } +} + +// Figure out the shape of other xladot argument for reducing contracting +// dimension. +// It must have the contracting dimensions on its shape, to reduce the +// contracting dims from the original target. 
In addition, to match with +// the XLADotV2 output shape, it requires the following additional rank: +// xladot_out_rank - used_rank (= batch_rank + output_rank), with dim 1. +// The final shape of the opponent should be: +// c1,..,cn,1,...,1 for rhs opponent, 1,..,1, c1,..,cn for lhs opponent. +// Returns the number of contracting dims. +int GetXLADotPseudoOpponentShapeForReducingContractDims( + const xla::DotDimensionNumbers &dnums, const int xladot_output_rank, + ShapedType tensor_shape, const bool is_lhs, + SmallVector &opponent_shape) { + int opponent_required_dim = xladot_output_rank; + int used_rank = tensor_shape.getRank(); + + if (is_lhs) { + used_rank -= dnums.lhs_contracting_dimensions_size(); + for (int64_t v : dnums.lhs_contracting_dimensions()) { + opponent_shape.push_back(tensor_shape.getDimSize(v)); + } + } else { + used_rank -= dnums.rhs_contracting_dimensions_size(); + for (int64_t v : dnums.rhs_contracting_dimensions()) { + opponent_shape.push_back(tensor_shape.getDimSize(v)); + } + } + + const int num_contract_dim = opponent_shape.size(); + opponent_required_dim -= used_rank; + + // Add redundant 1s to match the shape. + // Required 1s = out_dims - # my batch_dims - my remaining dims. + if (!is_lhs) { + absl::c_reverse(opponent_shape); + } + for (int i = 0; i < opponent_required_dim; i++) { + opponent_shape.push_back(1); + } + if (!is_lhs) { + absl::c_reverse(opponent_shape); + } + + return num_contract_dim; +} + +// Create a matrix with 1s using the given shape. +Operation *Create1sMatrix(OpBuilder &builder, Location loc, + const SmallVector &shape) { + SmallVector shape_ones(/*Size=*/shape.size(), /*Value=*/1); + + return builder.create( + loc, RankedTensorType::get(shape, builder.getIntegerType(32)), + CreateConstValue(builder, loc, shape_ones, {1}), + Create1DConstValue(builder, loc, shape)); +} + +// Create the output shape for XlaDotV2, given dot dimension numbers and shapes +// of both inputs. +SmallVector CreateOutputShape(const xla::DotDimensionNumbers &ddn, + const ArrayRef lhs_shape, + const ArrayRef rhs_shape) { + SmallVector output_shape; + + // Prepare necessary indices. + absl::flat_hash_set lhs_remove_idx, rhs_remove_idx; + for (auto v : ddn.lhs_batch_dimensions()) { + lhs_remove_idx.insert(v); + } + for (auto v : ddn.lhs_contracting_dimensions()) { + lhs_remove_idx.insert(v); + } + for (auto v : ddn.rhs_batch_dimensions()) { + rhs_remove_idx.insert(v); + } + for (auto v : ddn.rhs_contracting_dimensions()) { + rhs_remove_idx.insert(v); + } + + // Gather shapes for output. + for (auto v : ddn.lhs_batch_dimensions()) { + output_shape.push_back(lhs_shape[v]); + } + + // Batch dimension is gathered from the right side. + if (output_shape.empty()) { + for (auto v : ddn.rhs_batch_dimensions()) { + output_shape.push_back(rhs_shape[v]); + } + } + + // Gather remaining dimensions. + for (int i = 0; i < lhs_shape.size(); i++) { + if (lhs_remove_idx.find(i) == lhs_remove_idx.end()) { + output_shape.push_back(lhs_shape[i]); + } + } + + for (int i = 0; i < rhs_shape.size(); i++) { + if (rhs_remove_idx.find(i) == rhs_remove_idx.end()) { + output_shape.push_back(rhs_shape[i]); + } + } + + return output_shape; +} + +// Generate an einsum equation from the given DotDimensionNumber. +std::string CreateEinsumEquation(const xla::DotDimensionNumbers &ddn, + const int lhs_rank, const int rhs_rank) { + // Prepare necessary indices. 
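// Illustrative example (hypothetical dimension numbers): with lhs_rank = 3,
// rhs_rank = 2, lhs_batch = {0}, lhs_contracting = {2}, and
// rhs_contracting = {0}, the generated equation is "abc,cd->abd": the lhs
// batch dim keeps its letter, the rhs contracting dim reuses the lhs
// contracting letter, and the output lists the batch dim followed by the
// leftover lhs and rhs dims.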
+ absl::flat_hash_set lhs_batch_idx, rhs_batch_idx; + absl::flat_hash_set lhs_contract_idx, rhs_contract_idx; + for (auto v : ddn.lhs_batch_dimensions()) { + lhs_batch_idx.insert(v); + } + for (auto v : ddn.lhs_contracting_dimensions()) { + lhs_contract_idx.insert(v); + } + for (auto v : ddn.rhs_batch_dimensions()) { + rhs_batch_idx.insert(v); + } + for (auto v : ddn.rhs_contracting_dimensions()) { + rhs_contract_idx.insert(v); + } + + // Generate equation. + std::string lhs_eq = ""; + std::string rhs_eq = ""; + std::string out_eq = ""; + char c = 'a'; + std::vector lhs_batch_dims; + std::vector lhs_contract_dims; + for (int i = 0; i < lhs_rank; i++) { + absl::StrAppend(&lhs_eq, std::string(1, c)); + if (lhs_batch_idx.find(i) != lhs_batch_idx.end()) { + lhs_batch_dims.push_back(c); + } else if (lhs_contract_idx.find(i) != lhs_contract_idx.end()) { + lhs_contract_dims.push_back(c); + } + c++; + } + + int batch_trace_idx = 0; + int contract_trace_idx = 0; + bool rhs_only_batch = lhs_batch_dims.empty(); + for (int i = 0; i < rhs_rank; i++) { + if (rhs_batch_idx.find(i) != rhs_batch_idx.end()) { + if (!rhs_only_batch) { + absl::StrAppend(&rhs_eq, + std::string(1, lhs_batch_dims[batch_trace_idx])); + batch_trace_idx++; + } else { + absl::StrAppend(&rhs_eq, std::string(1, c)); + lhs_batch_dims.push_back(c); + c++; + } + } else if (rhs_contract_idx.find(i) != rhs_contract_idx.end()) { + absl::StrAppend(&rhs_eq, + std::string(1, lhs_contract_dims[contract_trace_idx])); + contract_trace_idx++; + } else { + rhs_eq += c; + c++; + } + } + + // Create out_eq by merging lhs and rhs. + // In XlaDotv2 style - batch dim - leftover from lhs - leftover from rhs. + for (auto c : lhs_batch_dims) { + absl::StrAppend(&out_eq, std::string(1, c)); + } + for (auto c : lhs_eq) { + if (!absl::StrContains(out_eq, c) && !absl::StrContains(rhs_eq, c)) { + absl::StrAppend(&out_eq, std::string(1, c)); + } + } + for (auto c : rhs_eq) { + if (!absl::StrContains(out_eq, c) && !absl::StrContains(lhs_eq, c)) { + absl::StrAppend(&out_eq, std::string(1, c)); + } + } + + return absl::StrCat(lhs_eq, ",", rhs_eq, "->", out_eq); +} + +// Check if the given einsum equation could be replaced with "reduce". +bool IsReducable(const StringRef einsum_equation, + const xla::DotDimensionNumbers &dnums, const bool is_lhs, + SmallVector &out_dims) { + int idx_arrow = einsum_equation.find("->"); + StringRef calc_eq = einsum_equation.substr(0, idx_arrow); + StringRef out_eq = einsum_equation.substr(idx_arrow + 2); + + int idx_comma = calc_eq.find(','); + StringRef lhs_eq = calc_eq.substr(0, idx_comma); + StringRef rhs_eq = calc_eq.substr(idx_comma + 1); + + std::string target_eq; + if (is_lhs) { + target_eq = lhs_eq; + for (auto v : dnums.lhs_contracting_dimensions()) { + target_eq[v] = '_'; + } + } else { + target_eq = rhs_eq; + for (auto v : dnums.rhs_contracting_dimensions()) { + target_eq[v] = '_'; + } + } + + if (target_eq.size() > out_eq.size()) return false; + + for (int i = 0; i < target_eq.size(); i++) { + int out_idx = out_eq.size() - target_eq.size() + i; + if (target_eq[i] != '_' && out_eq[out_idx] != target_eq[i]) { + return false; + } + + if (target_eq[i] != '_') out_dims.push_back(i); + } + + return true; +} + +// Calculates other_tensor_zp * tensor for zero point offset calculation. +// Things to do: +// 1. Reduce the tensor (which is an input of XlaDotV2) with contracting +// dimensions of XlaDotV2. +// - The resultant dimension must match with XlaDotV2 resultant dimension +// 2. 
Multiply it with zero point from the other tensor. +// We decided to use tf.Einsum for step 1, since it would require transposes/ +// reshapes in many cases. More precisely, this function creates 1s matrix +// with appropriate shape to match with the shape of XlaDotV2 result. +// We didn't apply XlaEinsum or XlaDotV2 for this work, since it would loose +// the chance for constant folding later. We could try to add some +// postprocessing passes later to further optimize the graph after constant +// folding. +Value CreateZeroPointPartialOffsetXlaDotV2( + OpBuilder &builder, Location loc, Value tensor, + const int8_t other_tensor_zp, const xla::DotDimensionNumbers &dnums, + const bool is_lhs, const int xladot_output_rank) { + if (other_tensor_zp == 0) { + return CreateScalarConstValue(builder, loc, 0); + } + + auto shape = mlir::cast(tensor.getType()); + SmallVector tensor_shape; + for (auto v : shape.getShape()) { + tensor_shape.push_back(v); + } + + auto zp = CreateScalarConstValue(builder, loc, other_tensor_zp); + + TensorType tensor_type = mlir::dyn_cast(tensor.getType()); + Value tensor_i32 = builder.create( + loc, tensor_type.clone(builder.getIntegerType(32)), tensor); + + // Figure out the shape of einsum opponent pseudo-input. + SmallVector opponent_shape; + const int num_contract_dim = + GetXLADotPseudoOpponentShapeForReducingContractDims( + dnums, xladot_output_rank, shape, is_lhs, opponent_shape); + + // Generate the dimension numbers for reduce. + xla::DotDimensionNumbers reduce_dnums; + CopyXlaDotDimensionNumbers(dnums, reduce_dnums, is_lhs); + const int contracting_dim_start = + is_lhs ? 0 : opponent_shape.size() - num_contract_dim; + for (int i = contracting_dim_start; + i < contracting_dim_start + num_contract_dim; i++) { + if (is_lhs) { + reduce_dnums.add_rhs_contracting_dimensions(i); + } else { + reduce_dnums.add_lhs_contracting_dimensions(i); + } + } + + // Create the pseudo opponent matrix. + Operation *one_matrix = Create1sMatrix(builder, loc, opponent_shape); + + // Calculate output shape of the reduce einsum operation. + SmallVector output_shape; + SmallVector input_arguments; + int lhs_rank, rhs_rank; + if (is_lhs) { + output_shape = + CreateOutputShape(reduce_dnums, tensor_shape, opponent_shape); + input_arguments.push_back(tensor_i32); + input_arguments.push_back(one_matrix->getResult(0)); + lhs_rank = tensor_shape.size(); + rhs_rank = opponent_shape.size(); + } else { + output_shape = + CreateOutputShape(reduce_dnums, opponent_shape, tensor_shape); + input_arguments.push_back(one_matrix->getResult(0)); + input_arguments.push_back(tensor_i32); + lhs_rank = opponent_shape.size(); + rhs_rank = tensor_shape.size(); + } + + // Create the equation. + const std::string einsum_equation = + CreateEinsumEquation(reduce_dnums, lhs_rank, rhs_rank); + + // Check if we can create "reduce" instead of "einsum". + // Condition: the target equation except contracting dimension must match the + // end of out equation. 
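// Illustrative example (hypothetical equation): for "abc,cd->abd" with
// lhs_contracting = {2}, masking the contracting dim gives "ab_", which
// matches the tail of "abd"; the partial offset can then be computed with a
// plain tf.Sum over dim 2 (out_dims = {0, 1}) instead of a tf.Einsum.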
+ SmallVector out_dims; + if (IsReducable(einsum_equation, dnums, is_lhs, out_dims)) { + return CreateZeroPointPartialOffset(builder, loc, tensor, other_tensor_zp, + out_dims); + } + + Value reduced = builder.create( + loc, RankedTensorType::get(output_shape, builder.getIntegerType(32)), + input_arguments, builder.getStringAttr(einsum_equation)); + + reduced.getDefiningOp()->setAttr( + kTfQuantCreatedEinsum, + BoolAttr::get(reduced.getDefiningOp()->getContext(), true)); + auto mul_op = builder.create(loc, zp, reduced); + SmallVector folded_results = ConstantFoldOpIfPossible(mul_op); + return folded_results.front(); +} + +// Calculates zero-point offset by reducing the weight and multiply it with zp. +// Originally, we have: +// output = (int8_input - input_zp) * (int8_weight - weight_zp) +// So, offset = input_zp * int8_weight + weight_zp * int8_input +// - input_zp * weight_zp. +// This function calculates the `offset` value mentioned above. Note that the +// `output_dims` is the weight dimensions that are not contracted, so they +// appear in the output shape. +Value CalculateZeroPointOffsetXLADotV2(OpBuilder &builder, Location loc, + Value input, Value weight, + int8_t input_zp, int8_t weight_zp, + const xla::DotDimensionNumbers &dnums, + int output_rank) { + Value zp_input_contribution = CreateZeroPointPartialOffsetXlaDotV2( + builder, loc, input, weight_zp, dnums, /*is_lhs=*/true, output_rank); + Value zp_weight_contribution = CreateZeroPointPartialOffsetXlaDotV2( + builder, loc, weight, input_zp, dnums, /*is_lhs=*/false, output_rank); + + auto weight_shape = mlir::cast(weight.getType()); + + absl::flat_hash_set rhs_contracting_dims; + for (auto dim : dnums.rhs_contracting_dimensions()) { + rhs_contracting_dims.insert(dim); + } + + SmallVector weight_output_dims; + for (int64_t i = 0; i < weight_shape.getRank(); i++) { + if (rhs_contracting_dims.find(i) == rhs_contracting_dims.end()) { + weight_output_dims.push_back(i); + } + } + + if (input_zp != 0 && weight_zp != 0) { + return MergeZeroPointOffset(builder, loc, weight, weight_output_dims, + input_zp, weight_zp, zp_input_contribution, + zp_weight_contribution); + } + + if (input_zp != 0) return zp_weight_contribution; + return zp_input_contribution; +} + +// Helper function to create a XlaConvV2Op for Conv2DOp, DepthwiseConv2DOp and +// Conv3DOp. +Value CreateXlaConvOp(OpBuilder &builder, Location loc, Value input, + Value filter, Value input_zp, Value conv_output, + ArrayAttr strides, ArrayAttr dilations, + StringAttr conv_padding, ArrayAttr explicit_paddings, + int feature_group_cnt, int num_dims = 4) { + int32_t input_zp_value; + if (!GetSplatValue(input_zp, input_zp_value)) { + emitError(loc, + "zero point is expected to be a constant with a single value"); + return {}; + } + if (strides.size() != num_dims || dilations.size() != num_dims) { + emitError(loc, + absl::StrFormat( + "strides and dilations are expected to be %d-element arrays", + num_dims)); + return {}; + } + + xla::ConvolutionDimensionNumbers dnums; + // Input: [N, H, W, C] for Conv2D or [N, D, H, W, C] for Conv3D. + dnums.set_input_batch_dimension(0); + dnums.set_input_feature_dimension(num_dims - 1); + // Kernel: [K, K, I, O] for Conv2D or [K, K, K, I, O] for Conv3D. + dnums.set_kernel_input_feature_dimension(num_dims - 2); + dnums.set_kernel_output_feature_dimension(num_dims - 1); + // Output: [N, H, W, C] for Conv2D or [N, D, H, W, C] for Conv3D. 
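// For the default num_dims = 4 (NHWC Conv2D), the assignments below amount
// to: batch dim 0 and feature dim 3 for both input and output, kernel
// input/output feature dims 2 and 3, and spatial dims {1, 2} for input and
// output versus {0, 1} for the kernel.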
+ dnums.set_output_batch_dimension(0); + dnums.set_output_feature_dimension(num_dims - 1); + + for (int64_t i : llvm::seq(1, num_dims - 1)) { + dnums.add_input_spatial_dimensions(i); + dnums.add_kernel_spatial_dimensions(i - 1); + dnums.add_output_spatial_dimensions(i); + } + + Value padding, window_strides, lhs_dilation, rhs_dilation, + feature_group_count; + PrepareXlaConvParams(builder, loc, strides, dilations, feature_group_cnt, + /*window_strides=*/window_strides, + /*lhs_dilation=*/lhs_dilation, + /*rhs_dilation=*/rhs_dilation, + /*feature_group_count=*/feature_group_count, + /*num_dims=*/num_dims); + + input = CalculatePaddingAndPadIfNeeded( + builder, loc, input, filter, input_zp_value, strides, dilations, + conv_padding, explicit_paddings, padding, num_dims); + + std::string precision_config_str; + Value xla_conv_output = + builder + .create( + loc, /*output_type=*/conv_output.getType(), + /*lhs=*/input, + /*rhs=*/filter, window_strides, padding, lhs_dilation, + rhs_dilation, feature_group_count, + builder.getStringAttr(dnums.SerializeAsString()), + /*precision_config=*/builder.getStringAttr(precision_config_str)) + .getOutput(); + + // Dynamic-range quantization wil always fall into this case. + if (input_zp_value == 0) return xla_conv_output; + + Value zp_offset = CalculateZeroPointOffset( + builder, loc, input, filter, input_zp_value, + /*weight_zp=*/0, + /*input_output_dims=*/ArrayRef({0}), + /*weight_output_dims=*/ArrayRef({num_dims - 1})); + return builder.create(loc, xla_conv_output, zp_offset).getZ(); +} + +// Creates a XlaConvV2Op from TF Conv2DOp and returns its output. The returned +// value will be used as an input of the next op. +Value CreateXlaConvOpFromTfConv2dOp(OpBuilder &builder, Location loc, + Value input, Value filter, Value input_zp, + Value conv_output, ArrayAttr strides, + ArrayAttr dilations, + StringAttr conv_padding, + ArrayAttr explicit_paddings) { + auto input_shape = mlir::cast(input.getType()); + auto filter_shape = mlir::cast(filter.getType()); + if (!input_shape.hasRank() || input_shape.getRank() != 4 || + !filter_shape.hasRank() || filter_shape.getRank() != 4) { + emitError(loc, "input and filter are expected to be 4D tensors"); + return {}; + } + + const int feature_group_cnt = + input_shape.getDimSize(3) / filter_shape.getDimSize(2); + return CreateXlaConvOp(builder, loc, input, filter, input_zp, conv_output, + strides, dilations, conv_padding, explicit_paddings, + feature_group_cnt); +} + +// Creates a XlaConvV2Op from TF DepthwiseConv2DOp and returns its output. +Value CreateXlaConvOpFromTfDepthwiseConv2dOp( + OpBuilder &builder, Location loc, Value input, Value filter, Value input_zp, + Value conv_output, ArrayAttr strides, ArrayAttr dilations, + StringAttr conv_padding, ArrayAttr explicit_paddings) { + auto input_shape = mlir::cast(input.getType()); + auto filter_shape = mlir::cast(filter.getType()); + if (!input_shape.hasRank() || input_shape.getRank() != 4 || + !filter_shape.hasRank() || filter_shape.getRank() != 4) { + emitError(loc, "input and filter are expected to be 4D tensors"); + return {}; + } + const int feature_group_cnt = input_shape.getDimSize(3); + + // Reshape the filter to [K, K, 1, I * O]. 
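// Illustrative example (hypothetical shape): a depthwise filter of shape
// [3, 3, 8, 2] (8 input channels, channel multiplier 2) is reshaped to
// [3, 3, 1, 16], and feature_group_cnt = 8 makes the XlaConvV2 a grouped
// convolution equivalent to the original depthwise op.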
+ SmallVector new_filter_shape{ + filter_shape.getDimSize(0), filter_shape.getDimSize(1), 1, + filter_shape.getDimSize(2) * filter_shape.getDimSize(3)}; + Value new_filter = builder.create( + loc, + RankedTensorType::get(new_filter_shape, filter_shape.getElementType()), + filter, Create1DConstValue(builder, loc, new_filter_shape)); + return CreateXlaConvOp(builder, loc, input, new_filter, input_zp, conv_output, + strides, dilations, conv_padding, explicit_paddings, + feature_group_cnt); +} + +// Creates a XlaConvV2Op from TF Conv3DOp and returns its output. +Value CreateXlaConvOpFromTfConv3dOp(OpBuilder &builder, Location loc, + Value input, Value filter, Value input_zp, + Value conv_output, ArrayAttr strides, + ArrayAttr dilations, + StringAttr conv_padding) { + auto input_shape = mlir::cast(input.getType()); + auto filter_shape = mlir::cast(filter.getType()); + if (!input_shape.hasRank() || input_shape.getRank() != 5 || + !filter_shape.hasRank() || filter_shape.getRank() != 5) { + emitError(loc, "input and filter are expected to be 5D tensors"); + return {}; + } + const int feature_group_cnt = + input_shape.getDimSize(4) / filter_shape.getDimSize(3); + + return CreateXlaConvOp(builder, loc, input, filter, input_zp, conv_output, + strides, dilations, conv_padding, + /*explicit_paddings=*/nullptr, feature_group_cnt, + /*num_dims=*/5); +} + +// Helper function to create an XlaDotV2Op. +Value CreateXlaDotV2Op(OpBuilder &builder, Location loc, Value input, + Value weight, Value input_zp, Value weight_zp, + Value output, const xla::DotDimensionNumbers &dnums) { + int32_t input_zp_value = 0; + int32_t weight_zp_value = 0; + if (input_zp != nullptr && !GetSplatValue(input_zp, input_zp_value)) { + emitError(loc, + "zero point is expected to be a constant with a single value"); + return {}; + } + + if (weight_zp != nullptr && !GetSplatValue(weight_zp, weight_zp_value)) { + emitError(loc, + "zero point is expected to be a constant with a single value"); + return {}; + } + + std::string precision_config_str; + + Value dot_result = + builder + .create( + loc, /*output=*/output.getType(), + /*lhs=*/input, + /*rhs=*/weight, + /*dimension_numbers=*/ + builder.getStringAttr(dnums.SerializeAsString()), + /*precision_config=*/builder.getStringAttr(precision_config_str)) + .getResult(); + + if (input_zp_value == 0) return dot_result; + + Value zp_offset = CalculateZeroPointOffsetXLADotV2( + builder, loc, input, weight, input_zp_value, weight_zp_value, dnums, + mlir::cast(output.getType()).getRank()); + + return builder.create(loc, dot_result, zp_offset); +} + +Value CreateXlaDotV2OpFromTfMatMulOp(OpBuilder &builder, Location loc, + Value input, Value weight, Value input_zp, + Value weight_zp, Value output, + BoolAttr transpose_a, + BoolAttr transpose_b) { + // Transpose and constant-fold the weight if needed. + if (transpose_b.getValue()) { + Value perm = Create1DConstValue(builder, loc, {1, 0}); + auto transpose_op = builder.create(loc, weight, perm); + weight = ConstantFoldOpIfPossible(transpose_op).front(); + } + + xla::DotDimensionNumbers dnums; + dnums.add_rhs_contracting_dimensions(0); + if (transpose_a.getValue()) { + dnums.add_lhs_contracting_dimensions(0); + } else { + dnums.add_lhs_contracting_dimensions(1); + } + + return CreateXlaDotV2Op(builder, loc, input, weight, input_zp, weight_zp, + output, dnums); +} + +// Gets the broadcasted shapes of the input and weight of the BatchMatMul op +// from their types. 
If there are dynamic dimesions, these shapes couldn't be +// used as the arguments for the BroadcastTo ops. +std::optional, SmallVector>> +GetBroadcastShapesForBatchMatmul(ShapedType input_type, + ShapedType weight_type) { + ArrayRef input_shape = input_type.getShape(); + ArrayRef weight_shape = weight_type.getShape(); + + const int64_t num_matmul_dim = 2; + const int64_t num_input_batch_dim = input_type.getRank() - num_matmul_dim; + const int64_t num_weight_batch_dim = weight_type.getRank() - num_matmul_dim; + + ArrayRef input_batch_dims = + input_shape.slice(0, num_input_batch_dim); + ArrayRef weight_batch_dims = + weight_shape.slice(0, num_weight_batch_dim); + ArrayRef input_matmul_dims = + input_shape.slice(num_input_batch_dim, num_matmul_dim); + ArrayRef weight_matmul_dims = + weight_shape.slice(num_weight_batch_dim, num_matmul_dim); + + SmallVector broadcasted_batch_dims; + if (!OpTrait::util::getBroadcastedShape(input_batch_dims, weight_batch_dims, + broadcasted_batch_dims)) { + return std::nullopt; + } + SmallVector broadcasted_input_shape(broadcasted_batch_dims); + broadcasted_input_shape.append(input_matmul_dims.begin(), + input_matmul_dims.end()); + SmallVector broadcasted_weight_shape(broadcasted_batch_dims); + broadcasted_weight_shape.append(weight_matmul_dims.begin(), + weight_matmul_dims.end()); + + return std::make_pair(std::move(broadcasted_input_shape), + std::move(broadcasted_weight_shape)); +} + +// Broadcasts batch dimensions of the input and weight of the BatchMatMul +// op. In XLA, shapes are all constants, so all operations created in this +// function, except BroadcastTo, are expected to be folded. +void BroadcastBatchDimensionsForBatchMatMul(OpBuilder &builder, Location loc, + Value &input, Value &weight) { + ShapedType input_type = mlir::cast(input.getType()); + ShapedType weight_type = mlir::cast(weight.getType()); + const int32_t input_rank = input_type.getRank(); + const int32_t weight_rank = weight_type.getRank(); + const int32_t broadcasted_rank = std::max(input_rank, weight_rank); + + const int32_t num_matmul_dim = 2; + const int32_t num_input_batch_dim = input_rank - num_matmul_dim; + const int32_t num_weight_batch_dim = weight_rank - num_matmul_dim; + if (num_input_batch_dim == 0 && num_weight_batch_dim == 0) return; + + // If the broadcasted shapes can be calculated statically, only add two + // BroadcastTo ops for input and weight. 
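// Illustrative example (hypothetical shapes): input [1, 4, 2, 3] and weight
// [5, 1, 3, 6] have batch dims [1, 4] and [5, 1], which broadcast to [5, 4];
// the BroadcastTo target shapes below are then [5, 4, 2, 3] and [5, 4, 3, 6].
// Otherwise the target shapes are assembled at run time further below.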
+ auto broadcasted_shapes_or = + GetBroadcastShapesForBatchMatmul(input_type, weight_type); + if (!broadcasted_shapes_or.has_value()) return; + const auto broadcasted_input_type = RankedTensorType::get( + broadcasted_shapes_or->first, input_type.getElementType()); + const auto broadcasted_weight_type = RankedTensorType::get( + broadcasted_shapes_or->second, weight_type.getElementType()); + + if (broadcasted_input_type.hasStaticShape() && + broadcasted_weight_type.hasStaticShape()) { + input = builder.create( + loc, broadcasted_input_type, input, + Create1DConstValue(builder, loc, broadcasted_shapes_or->first)); + weight = builder.create( + loc, broadcasted_weight_type, weight, + Create1DConstValue(builder, loc, broadcasted_shapes_or->second)); + return; + } + + const Value zero = Create1DConstValue(builder, loc, {0}); + const Value num_matmul_dim_value = + Create1DConstValue(builder, loc, {num_matmul_dim}); + const Value num_input_batch_dim_value = + Create1DConstValue(builder, loc, {num_input_batch_dim}); + const Value num_weight_batch_dim_value = + Create1DConstValue(builder, loc, {num_weight_batch_dim}); + + // Decompose the input and weight shape into batch and matmul dimensions. + Value input_shape = builder.create( + loc, input, /*use32Bit=*/builder.getBoolAttr(false)); + Value input_batch_dims = builder.create( + loc, RankedTensorType::get({num_input_batch_dim}, builder.getI64Type()), + input_shape, zero, num_input_batch_dim_value); + Value input_matmul_dims = builder.create( + loc, RankedTensorType::get({num_matmul_dim}, builder.getI64Type()), + input_shape, num_input_batch_dim_value, num_matmul_dim_value); + + Value weight_shape = builder.create( + loc, weight, /*use32Bit=*/builder.getBoolAttr(false)); + Value weight_batch_dims = builder.create( + loc, RankedTensorType::get({num_weight_batch_dim}, builder.getI64Type()), + weight_shape, zero, num_weight_batch_dim_value); + Value weight_matmul_dims = builder.create( + loc, RankedTensorType::get({num_matmul_dim}, builder.getI64Type()), + weight_shape, num_weight_batch_dim_value, num_matmul_dim_value); + + // Calculate the broadcasted shapes. + Value broadcasted_batch_dims = builder.create( + loc, + RankedTensorType::get({broadcasted_rank - num_matmul_dim}, + builder.getI64Type()), + input_batch_dims, weight_batch_dims); + Type broadcasted_shape_type = + RankedTensorType::get({broadcasted_rank}, builder.getI64Type()); + + const Value zero_scalar = CreateScalarConstValue(builder, loc, 0); + Value broacasted_input_shape = builder.create( + loc, broadcasted_shape_type, /*concat_dim=*/zero_scalar, + ValueRange{broadcasted_batch_dims, input_matmul_dims}); + Value broacasted_weight_shape = builder.create( + loc, broadcasted_shape_type, /*concat_dim=*/zero_scalar, + ValueRange{broadcasted_batch_dims, weight_matmul_dims}); + + // Broadcast input and weight with the calculated shapes. + input = builder.create(loc, broadcasted_input_type, input, + broacasted_input_shape); + weight = builder.create(loc, broadcasted_weight_type, + weight, broacasted_weight_shape); +} + +Value CreateXlaDotV2OpFromTfBatchMatMulOp(OpBuilder &builder, Location loc, + Value input, Value weight, + Value input_zp, Value weight_zp, + Value output, BoolAttr adj_x, + BoolAttr adj_y) { + // TensorFlow BatchMatMulOp allows the batch dimensions to be broadcastable + // while the XlaDotV2Op doesn't. So we have to broadcast them beforehand. + BroadcastBatchDimensionsForBatchMatMul(builder, loc, input, weight); + + // Both input and weight have the same rank after broadcasting. 
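// Illustrative example (hypothetical rank): for a rank-4 weight with
// adj_y = true, num_batch_dim is 2 and the permutation built below is
// [0, 1, 3, 2], i.e. only the two matmul dims are swapped.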
+ ShapedType weight_shape = mlir::cast(weight.getType()); + int num_batch_dim = weight_shape.getRank() - 2; + + // Transpose and constant-fold the weight if needed. + if (adj_y.getValue()) { + SmallVector perm_values(num_batch_dim); + absl::c_iota(perm_values, 0); + perm_values.push_back(num_batch_dim + 1); + perm_values.push_back(num_batch_dim); + Value perm = Create1DConstValue(builder, loc, perm_values); + auto transpose_op = builder.create(loc, weight, perm); + weight = ConstantFoldOpIfPossible(transpose_op).front(); + } + + xla::DotDimensionNumbers dnums; + for (int i : llvm::seq(0, num_batch_dim)) { + dnums.add_lhs_batch_dimensions(i); + dnums.add_rhs_batch_dimensions(i); + } + dnums.add_rhs_contracting_dimensions(num_batch_dim); + if (adj_x.getValue()) { + dnums.add_lhs_contracting_dimensions(num_batch_dim); + } else { + dnums.add_lhs_contracting_dimensions(num_batch_dim + 1); + } + + return CreateXlaDotV2Op(builder, loc, input, weight, input_zp, weight_zp, + output, dnums); +} + +// Check if the given value is a ranked type with specified integer width. +bool IsRankedInt(Value value, const int integer_width) { + ShapedType value_type = mlir::cast(value.getType()); + if (!value_type.hasRank()) return false; + if (!value_type.getElementType().isInteger(integer_width)) return false; + + return true; +} + +// Constraint to check: +// 1. The einsum has two inputs and one output. +// 2. The einsum is not created by the convert function itself. +// 3. Both inputs are int32 tensor. +// 4. Both inputs have the graph ancestor of either const-(sub), or cast-sub. +// 5. The type of the const tensor (or input of the cast operation) is int8. +bool IsEinsumOpSupported(Value output, OperandRange args, + StringAttr equation_attr) { + Operation *op = output.getDefiningOp(); + if (op->getAttrOfType(kTfQuantCreatedEinsum) != nullptr) { + return false; + } + + // Only supports einsum with two inputs and one specified output. + if (args.size() != 2) return false; + if (!absl::StrContains(equation_attr.str(), "->")) return false; + + // Check the types and ranks of the input arguments. + if (!IsRankedInt(args[0], 32)) return false; + if (!IsRankedInt(args[1], 32)) return false; + + // Trace the graph to see if the conversion is applicable. + Operation *op_input = args[0].getDefiningOp(); + Operation *op_weight = args[1].getDefiningOp(); + if (isa(op_input)) { + op_input = op_input->getOperand(0).getDefiningOp(); + } + if (isa(op_weight)) { + op_weight = op_weight->getOperand(0).getDefiningOp(); + } + if (isa(op_input)) { + op_input = op_input->getOperand(0).getDefiningOp(); + } else if (!isa(op_input)) { + return false; + } + if (isa(op_weight)) { + op_weight = op_weight->getOperand(0).getDefiningOp(); + } else if (!isa(op_weight)) { + return false; + } + + if (!IsRankedInt(op_weight->getResult(0), 8)) return false; + if (!IsRankedInt(op_input->getResult(0), 8)) return false; + + return true; +} + +// Convert an einsum equation into XLA Dot Dimension Numbers. +// If the return flag is true, the arguments for XlaDotV2 should be swapped. +xla::DotDimensionNumbers ConvertEinsumEquationIntoXlaDotDimensionNumbers( + const StringRef equation) { + xla::DotDimensionNumbers dnums; + + // 1. Parse the given equation. 
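// Illustrative example (hypothetical equation): for "abc,acd->abd" the loops
// below yield lhs_batch = {0}, lhs_contracting = {2}, rhs_batch = {0}, and
// rhs_contracting = {1}, i.e. a batched matmul contracting over 'c'.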
+ int idx_arrow = equation.find("->"); + StringRef calc_eq = equation.substr(0, idx_arrow); + StringRef out_eq = equation.substr(idx_arrow + 2); + + int idx_comma = calc_eq.find(','); + StringRef lhs_eq = calc_eq.substr(0, idx_comma); + StringRef rhs_eq = calc_eq.substr(idx_comma + 1); + + // 2.Fill the DDN. + std::vector lhs_batch_dims, lhs_contract_dims; + std::vector rhs_batch_dims, rhs_contract_dims; + + for (int i = 0; i < lhs_eq.size(); i++) { + char c = lhs_eq.data()[i]; + if (absl::StrContains(out_eq, c) && absl::StrContains(rhs_eq, c)) { + dnums.add_lhs_batch_dimensions(i); + } else if (!absl::StrContains(out_eq, c)) { + dnums.add_lhs_contracting_dimensions(i); + } + } + + for (int i = 0; i < rhs_eq.size(); i++) { + char c = rhs_eq.data()[i]; + if (absl::StrContains(out_eq, c) && absl::StrContains(lhs_eq, c)) { + dnums.add_rhs_batch_dimensions(i); + } else if (!absl::StrContains(out_eq, c)) { + dnums.add_rhs_contracting_dimensions(i); + } + } + + return dnums; +} + +// Trace the graph to find out the actual operation. +Value getActualValue(Operation *op) { + if (isa(op)) { + op = op->getOperand(0).getDefiningOp(); + } + + if (isa(op)) { + op = op->getOperand(0).getDefiningOp(); + } + return op->getResult(0); +} + +Value CreateXlaDotV2OpFromTfEinsumOp(OpBuilder &builder, Location loc, + StringAttr equation_attr, + OperandRange args, Value output) { + xla::DotDimensionNumbers dnums = + ConvertEinsumEquationIntoXlaDotDimensionNumbers(equation_attr); + + // Look for zp. + Value input_zp = nullptr; + Value weight_zp = nullptr; + Operation *op_input = args[0].getDefiningOp(); + Operation *op_weight = args[1].getDefiningOp(); + if (isa(op_input)) { + input_zp = op_input->getOperand(1); + op_input = op_input->getOperand(0).getDefiningOp(); + } else { + builder.setInsertionPoint(op_input->getPrevNode()); + input_zp = Create1DConstValue(builder, loc, {0}); + } + + if (isa(op_weight)) { + weight_zp = op_weight->getOperand(1); + op_weight = op_weight->getOperand(0).getDefiningOp(); + } else { + builder.setInsertionPoint(op_weight->getPrevNode()); + weight_zp = Create1DConstValue(builder, loc, {0}); + } + + Value input = getActualValue(op_input); + Value weight = getActualValue(op_weight); + + return CreateXlaDotV2Op(builder, loc, input, weight, input_zp, weight_zp, + output, dnums); +} + +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_replace_cast_hacks_with_tf_xla_ops.inc" + +void ReplaceCastHacksWithTFXLAOpsPass::runOnOperation() { + func::FuncOp func = getOperation(); + MLIRContext *ctx = &getContext(); + RewritePatternSet patterns(ctx); + populateWithGenerated(patterns); + if (failed(applyPatternsGreedily(func, std::move(patterns)))) { + func.emitError() << "tf-quant-replace-cast-hacks-with-tf-xla-ops failed."; + signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr> +CreateReplaceCastHacksWithTFXLAOpsPass() { + return std::make_unique(); +} + +static PassRegistration pass; + +} // namespace mlir::tf_quant diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_replace_cast_hacks_with_tf_xla_ops.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_replace_cast_hacks_with_tf_xla_ops.td new file mode 100644 index 000000000000..ccd477c310e2 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_replace_cast_hacks_with_tf_xla_ops.td @@ -0,0 +1,531 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +include "mlir/IR/OpBase.td" +include "mlir/IR/PatternBase.td" +include "mlir/Dialect/Func/IR/FuncOps.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" + +def CreateXLAConvOpFromTFConv2DOp : NativeCodeCall< + "CreateXlaConvOpFromTfConv2dOp($_builder, $_loc, $0...)">; + +def CreateXLAConvOpFromTFDepthwiseConv2DOp : NativeCodeCall< + "CreateXlaConvOpFromTfDepthwiseConv2dOp($_builder, $_loc, $0...)">; + +def CreateXlaDotV2OpFromTfMatMulOp : NativeCodeCall< + "CreateXlaDotV2OpFromTfMatMulOp($_builder, $_loc, $0...)">; + +def CreateXLAConvOpFromTFConv3DOp : NativeCodeCall< + "CreateXlaConvOpFromTfConv3dOp($_builder, $_loc, $0...)">; + +def CreateXlaDotV2OpFromTfBatchMatMulOp : NativeCodeCall< + "CreateXlaDotV2OpFromTfBatchMatMulOp($_builder, $_loc, $0...)">; + +def CreateXlaDotV2OpFromTfEinsumOp : NativeCodeCall< + "CreateXlaDotV2OpFromTfEinsumOp($_builder, $_loc, $0...)">; + +def IsEinsumOpSupported : Constraint< + CPred<"IsEinsumOpSupported($0, $1, $2)">, + "Check if the given einsum op could be converted into a XlaDotV2 op.">; + +// Converts inlined Conv2D pattern to TF XlaConvV2 op. This pattern doesn't +// support non-constant weights. +def ConvertTFConv2DToXLAConvOp : Pat< + (TF_Conv2DOp:$conv + (TF_SubOp (TF_CastOp $input, $truncate), $input_zp), + (TF_CastOp (TF_IdentityOp $filter), $truncate1), + $strides, $use_cudnn, $padding, $explicit_padding, + IsDataFormatNHWC:$data_format, $dilations), + (CreateXLAConvOpFromTFConv2DOp + $input, $filter, $input_zp, $conv, $strides, + $dilations, $padding, $explicit_padding), + [(IsInt8ElementType $input), + (IsInt8ElementType $filter), + (IsConstTensor $input_zp), + (IsConstTensor $filter), + (IsInt32ElementType $conv), + (HasStaticShapeConstraint $filter), + (HasStaticShapeAtDimsConstraint<"3"> $input)], + [], (addBenefit 10)>; + +// Same as ConvertTFConv2DToXLAConvOp but handles the case where input zero +// point is dynaically calculated so not a constant. 
+def ConvertTFConv2DToXLAConvOpDynamicRange : Pat< + (TF_Conv2DOp:$conv + (TF_SubOp:$input (TF_CastOp $input_i8, $truncate0), $input_zp), + (TF_CastOp (TF_IdentityOp $filter), $truncate1), + $strides, $use_cudnn, $padding, $explicit_padding, + IsDataFormatNHWC:$data_format, $dilations), + (CreateXLAConvOpFromTFConv2DOp + $input, $filter, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $conv, $strides, + $dilations, $padding, $explicit_padding), + [(IsInt32ElementType $input), + (IsInt8ElementType $filter), + (IsConstTensor $filter), + (IsInt32ElementType $conv), + (HasStaticShapeConstraint $filter), + (HasStaticShapeAtDimsConstraint<"3"> $input)], + [], (addBenefit 10)>; + +// Convert Conv2D with hybrid inputs (f32 activation/int8 weight) to XlaConv +def ConvertTFConv2DToXLAConvOpWeightOnly : Pat< + (TF_Conv2DOp:$conv + $input, + (TF_MulOp (TF_CastOp (TF_IdentityOp $filter), $truncate1), $scale), + $strides, $use_cudnn, $padding, $explicit_padding, + IsDataFormatNHWC:$data_format, $dilations), + (TF_MulOp (CreateXLAConvOpFromTFConv2DOp + $input, $filter, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $conv, $strides, + $dilations, $padding, $explicit_padding), $scale), + [(IsF32ElementType $input), + (IsInt8ElementType $filter), + (IsConstTensor $filter), + (IsF32ElementType $conv), + (HasStaticShapeConstraint $filter), + (HasStaticShapeAtDimsConstraint<"3"> $input)], + [], (addBenefit 10)>; + +// Same as ConvertTFConv2DToXLAConvOp but handles the case where input zero +// point is 0 and the Sub op has been folded. +def ConvertTFConv2DWithNoZeroPointToXLAConvOp : Pat< + (TF_Conv2DOp:$conv + (TF_CastOp $input, $truncate), + (TF_CastOp (TF_IdentityOp $filter), $truncate1), + $strides, $use_cudnn, $padding, $explicit_padding, + IsDataFormatNHWC:$data_format, $dilations), + (CreateXLAConvOpFromTFConv2DOp + $input, $filter, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $conv, $strides, $dilations, $padding, $explicit_padding), + [(IsInt8ElementType $input), + (IsInt8ElementType $filter), + (IsConstTensor $filter), + (IsInt32ElementType $conv), + (HasStaticShapeConstraint $filter), + (HasStaticShapeAtDimsConstraint<"3"> $input)], + [], (addBenefit 10)>; + +// Converts inlined DepthwiseConv2D pattern to TF XlaConvV2 op. This pattern +// doesn't support non-constant weights. +def ConvertTFDepthwiseConv2DToXLAConvOp : Pat< + (TF_CastOp:$conv + (TF_DepthwiseConv2dNativeOp + (TF_CastOp:$cast_input + (TF_SubOp (TF_CastOp $input, $truncate1), $input_zp), $truncate2), + (TF_CastOp + (TF_CastOp (TF_IdentityOp $filter), $truncate3), $truncate4), + $strides, $padding, $explicit_padding, + IsDataFormatNHWC:$data_format, $dilations), $truncate5), + (CreateXLAConvOpFromTFDepthwiseConv2DOp + $input, $filter, $input_zp, $conv, $strides, + $dilations, $padding, $explicit_padding), + [(IsInt8ElementType $input), + (IsF32ElementType $cast_input), + (IsInt8ElementType $filter), + (IsConstTensor $input_zp), + (IsConstTensor $filter), + (IsInt32ElementType $conv), + (HasStaticShapeConstraint $filter), + (HasStaticShapeAtDimsConstraint<"3"> $input)], + [], (addBenefit 10)>; + +// Same as ConvertTFDepthwiseConv2DToXLAConvOp but handles the case where input +// zero point is dynaically calculated so not a constant. 
+def ConvertTFDepthwiseConv2DToXLAConvOpDynamicRange : Pat< + (TF_CastOp:$conv + (TF_DepthwiseConv2dNativeOp + (TF_CastOp + (TF_SubOp:$input (TF_CastOp $input_i8, $truncate0), $input_zp), $truncate1), + (TF_CastOp + (TF_CastOp (TF_IdentityOp $filter), $truncate2), $truncate3), + $strides, $padding, $explicit_padding, + IsDataFormatNHWC:$data_format, $dilations), $truncate4), + (CreateXLAConvOpFromTFDepthwiseConv2DOp + $input, $filter, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $conv, $strides, + $dilations, $padding, $explicit_padding), + [(IsInt32ElementType $input), + (IsInt8ElementType $filter), + (IsConstTensor $filter), + (IsInt32ElementType $conv), + (HasStaticShapeConstraint $filter), + (HasStaticShapeAtDimsConstraint<"3"> $input)], + [], (addBenefit 10)>; + +// Convert DepthwiseConv2D with hybrid inputs (f32 activation/int8 weight) to +// XlaConv +def ConvertTFDepthwiseConv2DToXLAConvOpWeightOnly : Pat< + (TF_DepthwiseConv2dNativeOp:$conv $input, + (TF_MulOp (TF_CastOp (TF_IdentityOp $filter), $truncate2), $scale), + $strides, $padding, $explicit_padding, + IsDataFormatNHWC:$data_format, $dilations), + (TF_MulOp (CreateXLAConvOpFromTFDepthwiseConv2DOp + $input, $filter, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $conv, $strides, + $dilations, $padding, $explicit_padding), $scale), + [(IsF32ElementType $input), + (IsInt8ElementType $filter), + (IsConstTensor $filter), + (IsF32ElementType $conv), + (HasStaticShapeConstraint $filter), + (HasStaticShapeAtDimsConstraint<"3"> $input)], + [], (addBenefit 10)>; + + +// Same as ConvertTFDepthwiseConv2DToXLAConvOp but handles the case where input +// zero point is 0 and the Sub op has been folded. +def ConvertTFDepthwiseConv2DWithNoZeroPointToXLAConvOp : Pat< + (TF_CastOp:$conv + (TF_DepthwiseConv2dNativeOp + (TF_CastOp:$cast_input + (TF_CastOp $input, $truncate1), $truncate2), + (TF_CastOp + (TF_CastOp (TF_IdentityOp $filter), $truncate3), $truncate4), + $strides, $padding, $explicit_padding, + IsDataFormatNHWC:$data_format, $dilations), $truncate5), + (CreateXLAConvOpFromTFDepthwiseConv2DOp + $input, $filter, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $conv, $strides, $dilations, $padding, $explicit_padding), + [(IsInt8ElementType $input), + (IsF32ElementType $cast_input), + (IsInt8ElementType $filter), + (IsConstTensor $filter), + (IsInt32ElementType $conv), + (HasStaticShapeConstraint $filter), + (HasStaticShapeAtDimsConstraint<"3"> $input)], + [], (addBenefit 10)>; + + +// Converts inlined MatMul pattern to TF XlaDotV2 op. This pattern doesn't +// support non-constant weights. +def ConvertTFMatMulToXLADotV2Op : Pat< + (TF_MatMulOp:$matmul + (TF_SubOp (TF_CastOp $input, $truncate), $input_zp), + (TF_CastOp (TF_IdentityOp $weight), $truncate1), + $transpose_a, $transpose_b, $grad_a, $grad_b), + (CreateXlaDotV2OpFromTfMatMulOp + $input, $weight, $input_zp, + /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), $matmul, + $transpose_a, $transpose_b), + [(IsInt8ElementType $input), + (IsInt8ElementType $weight), + (IsConstTensor $input_zp), + (IsConstTensor $weight), + (IsInt32ElementType $matmul), + (HasStaticShapeConstraint $weight)], + [], (addBenefit 10)>; + +// Same as ConvertTFMatMulToXLADotV2Op but handles the case where input zero +// point is dynaically calculated so not a constant. 
+def ConvertTFMatMulToXLADotV2OpDynamicRange : Pat< + (TF_MatMulOp:$matmul + (TF_SubOp:$input (TF_CastOp $input_i8, $truncate0), $input_zp), + (TF_CastOp (TF_IdentityOp $weight), $truncate1), + $transpose_a, $transpose_b, $grad_a, $grad_b), + (CreateXlaDotV2OpFromTfMatMulOp + $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $matmul, $transpose_a, $transpose_b), + [(IsInt32ElementType $input), + (IsInt8ElementType $weight), + (IsConstTensor $weight), + (IsInt32ElementType $matmul), + (HasStaticShapeConstraint $weight)], + [], (addBenefit 10)>; + +// Convert Matmul with hybrid inputs (f32 activation/int8 weight) to XlaDotV2 +def ConvertTFMatMulToXLADotV2OpWeightOnly : Pat< + (TF_MatMulOp:$matmul + $input, + (TF_MulOp (TF_CastOp (TF_IdentityOp $weight), $truncate1), $scale), + $transpose_a, $transpose_b, $grad_a, $grad_b), + (TF_MulOp (CreateXlaDotV2OpFromTfMatMulOp + $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $matmul, $transpose_a, $transpose_b), $scale), + [(IsF32ElementType $input), + (IsInt8ElementType $weight), + (IsConstTensor $weight), + (IsF32ElementType $matmul), + (HasStaticShapeConstraint $weight)], + [], (addBenefit 10)>; + +// Same as ConvertTFMatMulToXLADotV2Op but handles the case where input +// zero point is 0 and the Sub op has been folded. +def ConvertTFMatMulWithNoZeroPointToXLADotV2Op : Pat< + (TF_MatMulOp:$matmul + (TF_CastOp $input, $truncate), + (TF_CastOp (TF_IdentityOp $weight), $truncate1), + $transpose_a, $transpose_b, $grad_a, $grad_b), + (CreateXlaDotV2OpFromTfMatMulOp + $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $matmul, $transpose_a, $transpose_b), + [(IsInt8ElementType $input), + (IsInt8ElementType $weight), + (IsConstTensor $weight), + (IsInt32ElementType $matmul), + (HasStaticShapeConstraint $weight)], + [], (addBenefit 10)>; + +// Converts inlined MatMul pattern to TF XlaDotV2 op. This pattern supports +// non-constant weights. +def ConvertTFMatMulWithTwoInputTensorsToXLADotV2Op : Pat< + (TF_MatMulOp:$matmul + (TF_SubOp (TF_CastOp $input, $truncate1), $input_zp), + (TF_SubOp (TF_CastOp $weight, $truncate2), $weight_zp), + $transpose_a, $transpose_b, $grad_a, $grad_b), + (CreateXlaDotV2OpFromTfMatMulOp + $input, $weight, $input_zp, $weight_zp, $matmul, $transpose_a, $transpose_b), + [(IsInt8ElementType $input), + (IsInt8ElementType $weight), + (HasRank $input), + (HasRank $weight), + (HasRankOf<0> $input_zp), + (HasRankOf<0> $weight_zp), + (IsInt32ElementType $matmul)], + [], (addBenefit 10)>; + +// Same as ConvertTFMatMulWithTwoInputTensorsToXLADotV2Op but handles the case +// where input zero point is 0 and the Sub op has been folded. 
+def ConvertTFMatMulWithTwoInputTensorsAndNoInputZeroPointToXLADotV2Op : Pat< + (TF_MatMulOp:$matmul + (TF_CastOp $input, $truncate), + (TF_SubOp (TF_CastOp $weight, $truncate2), $weight_zp), + $transpose_a, $transpose_b, $grad_a, $grad_b), + (CreateXlaDotV2OpFromTfMatMulOp + $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $weight_zp, $matmul, $transpose_a, $transpose_b), + [(IsInt8ElementType $input), + (IsInt8ElementType $weight), + (HasRank $input), + (HasRank $weight), + (HasRankOf<0> $weight_zp), + (IsInt32ElementType $matmul)], + [], (addBenefit 10)>; + +// Same as ConvertTFMatMulWithTwoInputTensorsToXLADotV2Op but handles the case +// where weight zero point is 0 and the Sub op has been folded. +def ConvertTFMatMulWithTwoInputTensorsAndNoWeightZeroPointToXLADotV2Op : Pat< + (TF_MatMulOp:$matmul + (TF_SubOp (TF_CastOp $input, $truncate), $input_zp), + (TF_CastOp $weight, $truncate1), + $transpose_a, $transpose_b, $grad_a, $grad_b), + (CreateXlaDotV2OpFromTfMatMulOp + $input, $weight, $input_zp, + /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $matmul, $transpose_a, $transpose_b), + [(IsInt8ElementType $input), + (IsInt8ElementType $weight), + (HasRank $input), + (HasRank $weight), + (HasRankOf<0> $input_zp), + (IsInt32ElementType $matmul)], + [], (addBenefit 10)>; + +// Same as ConvertTFMatMulWithTwoInputTensorsToXLADotV2Op but handles the case +// where both zero point is 0 and the Sub op has been folded. +def ConvertTFMatMulWithTwoInputTensorsAndNoBothZeroPointsToXLADotV2Op : Pat< + (TF_MatMulOp:$matmul + (TF_CastOp $input, $truncate), + (TF_CastOp $weight, $truncate1), + $transpose_a, $transpose_b, $grad_a, $grad_b), + (CreateXlaDotV2OpFromTfMatMulOp + $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $matmul, $transpose_a, $transpose_b), + [(IsInt8ElementType $input), + (IsInt8ElementType $weight), + (HasRank $input), + (HasRank $weight), + (IsInt32ElementType $matmul)], + [], (addBenefit 10)>; + + +// Converts inlined Conv3D pattern to TF XlaConvV2 op. This pattern +// doesn't support non-constant weights. +def ConvertTFConv3DToXLAConvOp : Pat< + (TF_CastOp:$conv + (TF_Conv3DOp + (TF_CastOp:$cast_input + (TF_SubOp (TF_CastOp $input, $truncate1), $input_zp), $truncate2), + (TF_CastOp + (TF_CastOp (TF_IdentityOp $filter), $truncate3), $truncate4), + $strides, $padding, IsDataFormatNDHWC:$data_format, $dilations), + $truncate5), + (CreateXLAConvOpFromTFConv3DOp + $input, $filter, $input_zp, $conv, $strides, $dilations, $padding), + [(IsInt8ElementType $input), + (IsF32ElementType $cast_input), + (IsInt8ElementType $filter), + (IsConstTensor $filter), + (IsInt32ElementType $conv), + (HasStaticShapeConstraint $filter), + (HasStaticShapeAtDimsConstraint<"4"> $input)], + [], (addBenefit 10)>; + +// Same as ConvertTFConv3DToXLAConvOp but handles the case where input +// zero point is 0 and the Sub op has been folded. 
+def ConvertTFConv3DWithNoZeroPointToXLAConvOp : Pat< + (TF_CastOp:$conv + (TF_Conv3DOp + (TF_CastOp:$cast_input + (TF_CastOp $input, $truncate1), $truncate2), + (TF_CastOp + (TF_CastOp (TF_IdentityOp $filter), $truncate3), $truncate4), + $strides, $padding, IsDataFormatNDHWC:$data_format, $dilations), + $truncate5), + (CreateXLAConvOpFromTFConv3DOp + $input, $filter, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $conv, $strides, $dilations, $padding), + [(IsInt8ElementType $input), + (IsF32ElementType $cast_input), + (IsInt8ElementType $filter), + (IsConstTensor $filter), + (IsInt32ElementType $conv), + (HasStaticShapeConstraint $filter), + (HasStaticShapeAtDimsConstraint<"4"> $input)], + [], (addBenefit 10)>; + +// Converts inlined BatchMatMul pattern to TF XlaDotV2 op. This pattern doesn't +// support non-constant weights. +def ConvertTFBatchMatMulToXLADotV2Op : Pat< + (TF_BatchMatMulV2Op:$batch_matmul + (TF_SubOp (TF_CastOp $input, $truncate), $input_zp), + (TF_CastOp (TF_IdentityOp $weight), $truncate1), + $adj_x, $adj_y, $grad_x, $grad_y), + (CreateXlaDotV2OpFromTfBatchMatMulOp + $input, $weight, $input_zp, + /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $batch_matmul, $adj_x, $adj_y), + [(IsInt8ElementType $input), + (HasRank $input), + (IsInt8ElementType $weight), + (IsConstTensor $weight), + (IsInt32ElementType $batch_matmul), + (HasStaticShapeConstraint $weight)], + [], (addBenefit 10)>; + +// Same as ConvertTFBatchMatMulToXLADotV2Op but handles the case where input +// zero point is 0 and the Sub op has been folded. +def ConvertTFBatchMatMulWithNoZeroPointToXLADotV2Op : Pat< + (TF_BatchMatMulV2Op:$batch_matmul + (TF_CastOp $input, $truncate), + (TF_CastOp (TF_IdentityOp $weight), $truncate1), + $adj_x, $adj_y, $grad_x, $grad_y), + (CreateXlaDotV2OpFromTfBatchMatMulOp + $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $batch_matmul, $adj_x, $adj_y), + [(IsInt8ElementType $input), + (HasRank $input), + (IsInt8ElementType $weight), + (IsConstTensor $weight), + (IsInt32ElementType $batch_matmul), + (HasStaticShapeConstraint $weight)], + [], (addBenefit 10)>; + +// Converts inlined BatchMatMul pattern to TF XlaDotV2 op. Support for +// non-constant weights. +// TODO(b/263529454): Remove redundant identity of the rule input on the second +// argument. +def ConvertTFBatchMatMulWithTwoInputTensorsToXLADotV2Op : Pat< + (TF_BatchMatMulV2Op:$batch_matmul + (TF_SubOp (TF_CastOp $input, $truncate), $input_zp), + (TF_SubOp (TF_CastOp (TF_IdentityOp $weight), $truncate1), $weight_zp), + $adj_x, $adj_y, $grad_x, $grad_y), + (CreateXlaDotV2OpFromTfBatchMatMulOp + $input, $weight, $input_zp, $weight_zp, $batch_matmul, $adj_x, $adj_y), + [(IsInt8ElementType $input), + (IsInt8ElementType $weight), + (HasRank $input), + (HasRank $weight), + (HasRankOf<0> $input_zp), + (HasRankOf<0> $weight_zp), + (IsInt32ElementType $batch_matmul)], + [], (addBenefit 10)>; + +// Same as ConvertTFBatchMatMulWithTwoInputTensorsToXLADotV2O but handles +// the case where input zero point is 0 and the Sub op has been folded. 
+def ConvertTFBatchMatMulWithTwoInputTensorsAndNoInputZeroPointToXLADotV2Op : Pat< + (TF_BatchMatMulV2Op:$batch_matmul + (TF_CastOp $input, $truncate), + (TF_SubOp (TF_CastOp (TF_IdentityOp $weight), $truncate1), $weight_zp), + $adj_x, $adj_y, $grad_x, $grad_y), + (CreateXlaDotV2OpFromTfBatchMatMulOp + $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $weight_zp, $batch_matmul, $adj_x, $adj_y), + [(IsInt8ElementType $input), + (IsInt8ElementType $weight), + (HasRank $input), + (HasRank $weight), + (HasRankOf<0> $weight_zp), + (IsInt32ElementType $batch_matmul)], + [], (addBenefit 10)>; + +// Same as ConvertTFBatchMatMulWithTwoInputTensorsToXLADotV2O but handles +// the case where weight zero point is 0 and the Sub op has been folded. +def ConvertTFBatchMatMulWithTwoInputTensorsAndNoWeightZeroPointToXLADotV2Op : Pat< + (TF_BatchMatMulV2Op:$batch_matmul + (TF_SubOp (TF_CastOp $input, $truncate1), $input_zp), + (TF_CastOp $weight, $truncate2), + $adj_x, $adj_y, $grad_x, $grad_y), + (CreateXlaDotV2OpFromTfBatchMatMulOp + $input, $weight, $input_zp, + /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $batch_matmul, $adj_x, $adj_y), + [(IsInt8ElementType $input), + (IsInt8ElementType $weight), + (HasRank $input), + (HasRank $weight), + (HasRankOf<0> $input_zp), + (IsInt32ElementType $batch_matmul)], + [], (addBenefit 10)>; + +// Same as ConvertTFBatchMatMulWithTwoInputTensorsToXLADotV2O but handles +// the case where both zero points are 0 and the Sub op has been folded. +def ConvertTFBatchMatMulWithTwoInputTensorsAndNoBothZeroPointsToXLADotV2Op : Pat< + (TF_BatchMatMulV2Op:$batch_matmul + (TF_CastOp $input, $truncate1), + (TF_CastOp $weight, $truncate2), + $adj_x, $adj_y, $grad_x, $grad_y), + (CreateXlaDotV2OpFromTfBatchMatMulOp + $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), + $batch_matmul, $adj_x, $adj_y), + [(IsInt8ElementType $input), + (IsInt8ElementType $weight), + (HasRank $input), + (HasRank $weight), + (IsInt32ElementType $batch_matmul)], + [], (addBenefit 10)>; + +// Converts inlined Einsum pattern to TF XlaDotV2 op. +def ConvertTFEinsumToXLADotV2Op : Pat< + (TF_EinsumOp:$einsum + $args, $equation), + (CreateXlaDotV2OpFromTfEinsumOp + $equation, $args, $einsum), + [(IsInt32ElementType $einsum), + // Constraint to check: + // 1. The einsum has two inputs and one output. + // 2. The einsum is not created by the convert function itself. + // 3. Both inputs are int32 tensor. + // 4. Both inputs have the graph ancestor of either const-(sub), or cast-sub. + // 5. The type of the const tensor (or input of the cast operation) is int8. + (IsEinsumOpSupported $einsum, $args, $equation)], + [], (addBenefit 10)>; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_unfreeze_constants.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_unfreeze_constants.cc new file mode 100644 index 000000000000..a26be176f6e1 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_unfreeze_constants.cc @@ -0,0 +1,361 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/log/log.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_replace.h"
+#include "absl/strings/string_view.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CommandLine.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/IR/DialectRegistry.h"  // from @llvm-project
+#include "mlir/IR/Location.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/SymbolTable.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/TypeID.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/const_op_size.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"
+#include "tensorflow/compiler/mlir/utils/name_utils.h"
+
+namespace mlir {
+namespace tf_quant {
+namespace {
+
+using ::mlir::tf_saved_model::GetInitializerFunction;
+using ::mlir::tf_saved_model::GetSessionInitializerOp;
+using ::mlir::tf_saved_model::kTfSavedModelExportedNamesAttr;
+using ::mlir::tf_saved_model::kTfSavedModelInitializerRestoreType;
+using ::mlir::tf_saved_model::kTfSavedModelInitializerTypeAttr;
+using ::mlir::tf_saved_model::SessionInitializerOp;
+
+constexpr absl::string_view kDefaultConstName = "const";
+
+// The default lower threshold for the constant size for unfreezing.
+constexpr int64_t kDefaultConstantSizeThresholdInBytes = 64 * 1024;  // 64KiB
+
+// This pass "unfreezes" constants found in the module and converts them to
+// `tf.VarHandleOp`s. Also, an initialization pattern
+// `tf.AssignVariableOp(tf.VarHandleOp, tf.ConstOp)` is inserted into
+// the initializer function of type "restore_op" for each of the unfrozen constants.
+//
+// The constants whose sizes are smaller than `size_threshold_in_bytes_` will
+// not be converted to variables.
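+//
+// As an illustrative sketch (not an exact printout of the pass output), a
+// sufficiently large constant such as
+//
+//   %cst = "tf.Const"() {value = dense<...> : tensor<8x1024xf32>}
+//
+// has its uses rewritten to read from a variable instead:
+//
+//   %handle = "tf.VarHandleOp"() {shared_name = "const_0", ...}
+//   %value = "tf.ReadVariableOp"(%handle) : (...) -> tensor<8x1024xf32>
+//
+// while the "restore_op" initializer function gains a
+// `tf.AssignVariableOp(%handle, %cst)` that restores the original value.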
+class UnfreezeConstantsPass
+    : public PassWrapper<UnfreezeConstantsPass, OperationPass<ModuleOp>> {
+ public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(UnfreezeConstantsPass)
+
+  explicit UnfreezeConstantsPass()
+      : UnfreezeConstantsPass(kDefaultConstantSizeThresholdInBytes) {}
+
+  explicit UnfreezeConstantsPass(const int64_t size_threshold_in_bytes)
+      : size_threshold_in_bytes_(
+            CreateSizeThresholdInBytesOption(size_threshold_in_bytes)) {}
+
+  UnfreezeConstantsPass(const UnfreezeConstantsPass& other)
+      : UnfreezeConstantsPass{} {
+    size_threshold_in_bytes_ = other.size_threshold_in_bytes_.getValue();
+  }
+
+  StringRef getArgument() const override {
+    return "tf-quant-unfreeze-constants";
+  }
+
+  StringRef getDescription() const override {
+    return "Unfreeze large constants.";
+  }
+
+  void runOnOperation() override;
+
+ private:
+  Option<int64_t> CreateSizeThresholdInBytesOption(const int64_t init_value) {
+    return Option<int64_t>(
+        *this, "size_threshold_in_bytes", llvm::cl::init(init_value),
+        llvm::cl::desc(
+            "Lower threshold of the constant size for unfreezing. Constants "
+            "smaller than this value will not be converted to variables."));
+  }
+
+  void getDependentDialects(DialectRegistry& registry) const override {
+    registry.insert<TF::TensorFlowDialect,
+                    tf_saved_model::TensorFlowSavedModelDialect>();
+  }
+
+  // Lower-bound threshold for the size of the constant in bytes. Constants
+  // smaller than this threshold will not be unfrozen and will remain as
+  // constants.
+  Option<int64_t> size_threshold_in_bytes_;
+};
+
+// Adds the symbol to the "initializers" attribute of the session_initializer
+// op.
+void AddSymbolToInitializersAttr(SessionInitializerOp session_init_op,
+                                 FlatSymbolRefAttr symbol) {
+  const auto prev_initializers = session_init_op.getInitializersAttr();
+  llvm::SmallVector<Attribute> initializers_attrs{prev_initializers.begin(),
+                                                  prev_initializers.end()};
+  initializers_attrs.emplace_back(symbol);
+
+  session_init_op.setInitializersAttr(
+      ArrayAttr::get(session_init_op.getContext(), initializers_attrs));
+}
+
+// Returns the session_initializer op in the module if it exists. Otherwise,
+// creates a new session_initializer op and returns it.
+SessionInitializerOp GetOrCreateSessionInitializerOp(ModuleOp module_op) {
+  SessionInitializerOp session_init_op = GetSessionInitializerOp(module_op);
+
+  // Create one if it doesn't exist.
+  if (!session_init_op) {
+    OpBuilder builder(&module_op.getBodyRegion());
+
+    session_init_op = builder.create<SessionInitializerOp>(
+        module_op.getLoc(), /*initializers=*/builder.getArrayAttr({}));
+  }
+
+  return session_init_op;
+}
+
+// Creates the initializer function right after the SessionInitializer op.
+// Returns the newly created initializer function. The initializer function's
+// initializer_type is set to "restore_op" since it essentially serves as a
+// variable restoration function.
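+// As a rough sketch (illustrative only), the created function has the form:
+//
+//   func.func @init_func_restore_op() attributes {
+//       tf_saved_model.exported_names =
+//           ["tf_saved_model.session_initializer_restore_op"],
+//       tf_saved_model.initializer_type = "restore_op"} {
+//     return
+//   }
+//
+// and its symbol is appended to the session_initializer op's "initializers"
+// attribute.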
+func::FuncOp CreateInitializerFunc(ModuleOp module_op) {
+  SessionInitializerOp session_init_op =
+      GetOrCreateSessionInitializerOp(module_op);
+
+  OpBuilder builder(module_op.getContext());
+  builder.setInsertionPointAfter(session_init_op);
+
+  const Location loc = builder.getUnknownLoc();
+  const auto func_type = builder.getFunctionType(/*inputs=*/{}, /*results=*/{});
+
+  auto init_func = builder.create<func::FuncOp>(
+      loc, /*sym_name=*/"init_func_restore_op", func_type);
+  builder.createBlock(&init_func.getBody(), /*insertPt=*/init_func.begin(),
+                      /*arg_types=*/{}, /*arg_locs=*/{});
+
+  init_func->setAttr(kTfSavedModelExportedNamesAttr,
+                     builder.getStrArrayAttr(
+                         {"tf_saved_model.session_initializer_restore_op"}));
+  init_func->setAttr(
+      kTfSavedModelInitializerTypeAttr,
+      builder.getStringAttr(kTfSavedModelInitializerRestoreType));
+
+  builder.setInsertionPointToStart(&init_func.front());
+  builder.create<func::ReturnOp>(loc, /*operands=*/ValueRange{});
+
+  SymbolTable symbol_table(module_op);
+  symbol_table.insert(init_func);
+
+  AddSymbolToInitializersAttr(
+      session_init_op, FlatSymbolRefAttr::get(init_func.getSymNameAttr()));
+
+  return init_func;
+}
+
+// Returns true if the initializer function's tf_saved_model.initializer_type
+// matches `initializer_type`.
+bool IsInitializerType(func::FuncOp init_func_op, StringRef initializer_type) {
+  auto init_type = init_func_op->getAttrOfType<StringAttr>(
+      kTfSavedModelInitializerTypeAttr);
+  return init_type && init_type == initializer_type;
+}
+
+// Returns the initializer function whose tf_saved_model.initializer_type
+// is "restore_op". Creates and returns a new initializer function iff such
+// `FuncOp` is not found. The newly created initializer function's
+// initializer_type is "restore_op" and its symbol will be added to the symbol
+// table and to the session_initializer op's "initializers" attribute.
+func::FuncOp GetOrCreateInitializerFunc(ModuleOp module_op) {
+  if (auto init_func_op = GetInitializerFunction(
+          module_op, /*initializer_type=*/kTfSavedModelInitializerRestoreType);
+      init_func_op) {
+    return init_func_op;
+  } else {
+    // Create a new initializer function if the init function is not found.
+    return CreateInitializerFunc(module_op);
+  }
+}
+
+// Retrieves the ConstOp's name from its loc. Returns "const" if a name cannot
+// be produced from its loc.
+std::string GetConstOpName(TF::ConstOp const_op) {
+  if (const std::string name = GetNameFromLoc(const_op.getLoc());
+      !name.empty()) {
+    // Replace any occurrence of ";" with "_". ";" is an illegal character to
+    // be used as a `shared_name`.
+    return absl::StrReplaceAll(name, /*replacements=*/{{";", "_"}});
+  }
+
+  return std::string(kDefaultConstName);
+}
+
+// Collects the ConstOps to unfreeze.
+std::vector<TF::ConstOp> GetTargetConstOps(const int64_t size_threshold,
+                                           ModuleOp module_op) {
+  std::vector<TF::ConstOp> target_const_ops{};
+
+  // TODO(b/254636388): Lift the assumption that there are no initializer
+  // functions and avoid converting ConstOps inside initializer functions.
+  for (auto func_op : module_op.getOps<func::FuncOp>()) {
+    // Do not unfreeze constants under these functions.
+    if (func_op.getSymName().contains("while_body")) continue;
+    if (func_op.getSymName().contains("while_cond")) continue;
+    absl::c_copy_if(func_op.getOps<TF::ConstOp>(),
+                    std::back_inserter(target_const_ops),
+                    [size_threshold](TF::ConstOp const_op) -> bool {
+                      return quant::GetSizeInBytes(const_op) > size_threshold;
+                    });
+  }
+
+  return target_const_ops;
+}
+
+// Replaces every use of the ConstOps in `target_const_ops` with VarHandleOp ->
+// ReadVariableOp patterns.
+// The ConstOps are not erased. Returns the ConstOp -> shared_name mapping.
+// The shared_name is the shared name of the corresponding VarHandleOp.
+llvm::MapVector<TF::ConstOp, std::string> ReplaceConstOpUsesWithVariableReads(
+    llvm::ArrayRef<TF::ConstOp> target_const_ops) {
+  llvm::MapVector<TF::ConstOp, std::string> const_op_name_map{};
+
+  // Keeps track of the number of occurrences of each synthesized name. The
+  // `shared_name` of the newly created `VarHandleOp` will be generated by
+  // appending `"_{count}"` to the name.
+  absl::flat_hash_map<std::string, int> name_counts{};
+  for (auto const_op : target_const_ops) {
+    OpBuilder builder{const_op};
+
+    // TODO(b/254635554): Hoist VarHandleOp to the outermost function and pass
+    // down as arguments to avoid relying on shared variables.
+    const std::string name = GetConstOpName(const_op);
+    const int cnt = name_counts[name]++;
+
+    // Creates a unique name by appending its occurrence count.
+    const auto shared_name = absl::StrCat(name, "_", cnt);
+    const_op_name_map[const_op] = shared_name;
+
+    // Creates a VarHandleOp -> ReadVariableOp pair for each ConstOp.
+    const auto resource_type = RankedTensorType::get(
+        /*shape=*/{}, /*elementType=*/TF::ResourceType::get(
+            /*subtypes=*/llvm::ArrayRef{const_op.getType()},
+            builder.getContext()));
+    auto var_handle_op =
+        builder.create<TF::VarHandleOp>(const_op.getLoc(),
+                                        /*resource=*/resource_type,
+                                        /*container=*/"", shared_name);
+
+    auto read_variable_op = builder.create<TF::ReadVariableOp>(
+        const_op.getLoc(), const_op.getType(), var_handle_op);
+
+    // Replace each usage of ConstOp with the corresponding ReadVariableOp.
+    const_op.getResult().replaceAllUsesWith(read_variable_op);
+  }
+
+  return const_op_name_map;
+}
+
+// Inside `session_init_func`, creates AssignVariableOps(VarHandleOp, ConstOp)
+// for each VarHandleOp that replaces a ConstOp. The `session_init_func` will
+// essentially behave like restore_op for the newly created VarHandleOps whose
+// shared names are the values of `const_op_name_map`.
+void CreateAssignVariableOps(
+    llvm::MapVector<TF::ConstOp, std::string>& const_op_name_map,
+    func::FuncOp session_init_func) {
+  OpBuilder builder{&session_init_func.getBody()};
+
+  for (auto& [const_op, shared_name] : const_op_name_map) {
+    const auto element_type = TF::ResourceType::get(
+        /*subtypes=*/llvm::ArrayRef{const_op.getType()},
+        builder.getContext());
+
+    const auto ranked_tensor_type = RankedTensorType::get(
+        /*shape=*/{}, /*elementType=*/element_type);
+    auto var_handle_op =
+        builder.create<TF::VarHandleOp>(const_op.getLoc(),
+                                        /*resource=*/ranked_tensor_type,
+                                        /*container=*/"", shared_name);
+
+    // Assign the ConstOp to each VarHandleOp. These will be used to save the
+    // variable values to the checkpoint.
+    auto const_op_copy =
+        builder.create<TF::ConstOp>(const_op.getLoc(), const_op.getValue());
+
+    builder.create<TF::AssignVariableOp>(const_op.getLoc(),
+                                         /*resource=*/var_handle_op,
+                                         /*value=*/const_op_copy.getOutput());
+  }
+}
+
+void UnfreezeConstantsPass::runOnOperation() {
+  ModuleOp module_op = getOperation();
+
+  // Find the ConstOps to "unfreeze" into VarHandleOps.
+  const std::vector<TF::ConstOp> target_const_ops =
+      GetTargetConstOps(size_threshold_in_bytes_.getValue(), module_op);
+  if (target_const_ops.empty()) {
+    VLOG(1) << "No ConstOps found. UnfreezeConstantsPass is a no-op.";
+    return;
+  }
+
+  func::FuncOp session_init_func = GetOrCreateInitializerFunc(module_op);
+
+  // Replace each use of ConstOp with a VarHandleOp -> ReadVariableOp pattern.
+  llvm::MapVector<TF::ConstOp, std::string> const_op_name_map =
+      ReplaceConstOpUsesWithVariableReads(target_const_ops);
+
+  // In the session initializer function, assign the const op's values to the
+  // corresponding VarHandleOps.
+  CreateAssignVariableOps(const_op_name_map, session_init_func);
+
+  // Erase the ConstOps that are replaced by VarHandleOps.
+  absl::c_for_each(target_const_ops, [](auto const_op) { const_op.erase(); });
+}
+
+}  // namespace
+
+std::unique_ptr<OperationPass<ModuleOp>> CreateUnfreezeConstantsPass() {
+  return std::make_unique<UnfreezeConstantsPass>();
+}
+
+static PassRegistration<UnfreezeConstantsPass> pass([] {
+  return CreateUnfreezeConstantsPass();
+});
+
+}  // namespace tf_quant
+}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD
index f59c18a3fd62..130e6fde4096 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD
@@ -152,10 +152,10 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/protobuf:for_core_protos_cc",
         "//tensorflow/python/lib/core:pybind11_lib",
-        "//third_party/python_runtime:headers",  # build_cleaner: keep; Required for pybind11.
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/strings:string_view",
         "@local_tsl//tsl/platform:protobuf",
+        "@local_xla//third_party/python_runtime:headers",  # build_cleaner: keep; Required for pybind11.
         "@pybind11",
         "@pybind11_abseil//pybind11_abseil:absl_casters",
     ],
@@ -177,6 +177,27 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "tf_unfreeze_constants",
+    srcs = ["tf_unfreeze_constants.cc"],
+    hdrs = ["tf_unfreeze_constants.h"],
+    compatible_with = get_compatible_with_portable(),
+    deps = [
+        "//tensorflow/compiler/mlir/quantization/tensorflow:tf_passes",
+        "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes",
+        "//tensorflow/compiler/mlir/quantization/tensorflow/cc:save_variables",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings:string_view",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@local_xla//xla/tsl/platform:errors",
+        "@local_xla//xla/tsl/platform:status",
+        "@local_xla//xla/tsl/platform:statusor",
+    ],
+)
+
 cc_library(
     name = "unfreeze_constants",
     srcs = ["unfreeze_constants.cc"],
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
index b44c788bc10f..b9f62efb9394 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
@@ -5617,7 +5617,7 @@ def test_conv_model(
           testing.get_size_ratio(
               self._output_saved_model_path, self._input_saved_model_path
           ),
-          0.3,
+          0.31,
       )
 
     if enable_per_channel_quantization and target_opset == quant_opts_pb2.XLA:
@@ -5711,7 +5711,7 @@ def test_depthwise_conv2d_model(
     output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def
 
     # Due to other meta data, the compression is not exactly 1/4.
-    size_threshold = 0.5 if enable_per_channel_quantization else 0.32
+    size_threshold = 0.5 if enable_per_channel_quantization else 0.33
     self.assertLess(
         testing.get_size_ratio(
             self._output_saved_model_path, self._input_saved_model_path
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/tf_unfreeze_constants.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/tf_unfreeze_constants.cc
new file mode 100644
index 000000000000..c12ca5c2a76e
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/tf_unfreeze_constants.cc
@@ -0,0 +1,74 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/mlir/quantization/tensorflow/python/tf_unfreeze_constants.h"
+
+#include <string>
+
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/strings/string_view.h"
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+namespace quantization {
+
+// Unfreezes constants into variables and saves them to a checkpoint file under
+// `checkpoint_dir`. `checkpoint_dir` will be created within this function. It
+// will return a non-OK status if it already exists or permission is denied.
+// TODO(b/261652258): Make sure this works when there are non-frozen variables
+// in the model.
+absl::Status UnfreezeConstantsAndSaveVariables(
+    const absl::string_view checkpoint_dir, mlir::MLIRContext &ctx,
+    mlir::ModuleOp module_op) {
+  TF_RETURN_IF_ERROR(RunPasses(
+      /*name=*/kTfQuantConstantUnfreezingStepName, /*add_passes_func=*/
+      [](mlir::PassManager &pm) {
+        pm.addPass(mlir::tf_quant::CreateUnfreezeConstantsPass());
+      },
+      ctx, module_op));
+
+  if (const absl::Status create_dir_status =
+          Env::Default()->CreateDir(std::string(checkpoint_dir));
+      !create_dir_status.ok()) {
+    LOG(ERROR) << "Failed to create checkpoint directory at: "
+               << checkpoint_dir;
+    return create_dir_status;
+  }
+
+  TF_ASSIGN_OR_RETURN(const auto unused_variable_names,
+                      SaveVariablesToCheckpoint(checkpoint_dir, module_op));
+
+  return RunPasses(
+      /*name=*/kTfQuantInsertRestoreOpStepName,
+      /*add_passes_func=*/
+      [](mlir::PassManager &pm) {
+        pm.addPass(mlir::tf_quant::CreateInsertRestoreOpPass());
+        pm.addPass(mlir::tf_quant::CreateInsertSaveOpPass());
+        // Initialization by `tf.ConstOp` is no longer required as there is
+        // a `tf.RestoreV2Op` now.
+ pm.addPass( + mlir::tf_quant::CreateRemoveVariableInitializationByConstPass()); + }, + ctx, module_op); +} +} // namespace quantization +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/tf_unfreeze_constants.h b/tensorflow/compiler/mlir/quantization/tensorflow/python/tf_unfreeze_constants.h new file mode 100644 index 000000000000..4124f9602c31 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/tf_unfreeze_constants.h @@ -0,0 +1,38 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_TF_UNFREEZE_CONSTANTS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_TF_UNFREEZE_CONSTANTS_H_ + +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project + +namespace tensorflow { +namespace quantization { + +inline constexpr absl::string_view kTfQuantConstantUnfreezingStepName = + "tf_quant_constant_unfreezing"; +inline constexpr absl::string_view kTfQuantInsertRestoreOpStepName = + "tf_quant_insert_restore_op"; + +absl::Status UnfreezeConstantsAndSaveVariables(absl::string_view checkpoint_dir, + mlir::MLIRContext &ctx, + mlir::ModuleOp module_op); + +} // namespace quantization +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_TF_UNFREEZE_CONSTANTS_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc index 86d5b547f43a..70fbc7d0a73e 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc @@ -32,16 +32,17 @@ limitations under the License. 
#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h" -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h" -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h" -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" -#include "tensorflow/compiler/mlir/stablehlo/transforms/stablehlo_passes.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/fold_broadcast_pass.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/fuse_convolution_pass.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/unfuse_batch_norm_pass.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/rename_entrypoint_to_main.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/tf_stablehlo_pass.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h" @@ -82,7 +83,7 @@ void AddTFToStablehloPasses( // on TPU. // Extracts the StableHLO module from tf.XlaCallModuleOp if the StableHLO // module is serialized in it. - pm.addPass(mlir::odml::CreateLegalizeTFXlaCallModuleToStablehloPass()); + pm.addPass(mlir::stablehlo::CreateLegalizeTFXlaCallModuleToStablehloPass()); // Preprocesses TPU-targeting StableHLO module for support in TF Quantizer. 
pm.addPass(mlir::quant::CreateConvertTpuModelToCpuPass()); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_weights.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_weights.mlir index 7f7a5090439e..08fff1322be2 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_weights.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_weights.mlir @@ -1,4 +1,4 @@ -// RUN: tf-quant-opt %s -split-input-file -quant-quantize-weights | FileCheck %s +// RUN: tf-quant-opt %s -split-input-file -tf-quant-quantize-weights | FileCheck %s module { func.func @not_quantize_const() -> (tensor<2x1024xf32>) { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_add_dump_tensor_op.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_add_dump_tensor_op.mlir new file mode 100644 index 000000000000..324e72458072 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_add_dump_tensor_op.mlir @@ -0,0 +1,300 @@ +// RUN: tf-quant-opt %s -split-input-file -tf-quant-add-dump-tensor-op='debugger_type=whole_model' | FileCheck --check-prefix=WholeModel %s +// RUN: tf-quant-opt %s -split-input-file -tf-quant-add-dump-tensor-op='debugger_type=int_per_layer' | FileCheck --check-prefix=IntPerLayer %s +// RUN: tf-quant-opt %s -split-input-file -tf-quant-add-dump-tensor-op='debugger_type=float_per_layer' | FileCheck --check-prefix=FloatPerLayer %s + + +module { + func.func @conv(%arg0: tensor<1x2x2x3xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<[[[[1.600000e-01, 1.000000e-01], [5.100000e-01, 5.400000e-01], [-5.000000e-01, 4.100000e-01]], [[-3.500000e-01, 5.000000e-02], [-0.00999999977, 1.600000e-01], [-4.800000e-01, -2.400000e-01]]], [[[-3.500000e-01, -2.100000e-01], [-1.400000e-01, -2.000000e-02], [4.800000e-01, 3.500000e-01]], [[-1.900000e-01, 3.200000e-01], [0.00999999977, -7.000000e-02], [2.000000e-01, -4.000000e-02]]]]> : tensor<2x2x3x2xf32>} : () -> tensor<2x2x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<[-2.000000e+00, 3.000000e+00]> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.PartitionedCall"(%arg0, %cst, %cst_0) {config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2} : (tensor<1x2x2x3xf32>, tensor<2x2x3x2xf32>, tensor<2xf32>) -> tensor<*xf32> loc(callsite("test@conv"("Conv2D") at "QuantizationUnit(\12\06Conv2D\1a\04conv)")) + %1 = "tf.PartitionedCall"(%arg0, %cst, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} : (tensor<1x2x2x3xf32>, tensor<2x2x3x2xf32>, tensor<2xf32>) -> tensor<*xf32> loc(callsite("test@conv"("Conv2D_1") at "QuantizationUnit(\12\08Conv2D_1\1a\04conv)")) + func.return %0, %1 : tensor<*xf32>, tensor<*xf32> + } + func.func private @composite_conv2d_with_bias_and_relu6_fn_2(%arg0: tensor<1x2x2x3xf32>, %arg1: tensor<2x2x3x2xf32>, %arg2: tensor<2xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x2x2x3xf32>, tensor<2x2x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %arg2) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) : 
(tensor<*xf32>) -> tensor<*xf32> + func.return %2 : tensor<*xf32> + } + func.func private @composite_conv2d_with_bias_and_relu6_fn_1(%arg0: tensor<1x2x2x3xf32>, %arg1: tensor<2x2x3x2xf32>, %arg2: tensor<2xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x2x2x3xf32>, tensor<2x2x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %arg2) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) : (tensor<*xf32>) -> tensor<*xf32> + func.return %2 : tensor<*xf32> + } + +// WholeModel-LABEL: func @conv +// WholeModel-DAG: %[[w:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}1.600000e-01, 1.000000e-01 +// WholeModel-DAG: %[[b:.*]] = "tf.Const"() <{value = dense<[-2.000000e+00, 3.000000e+00 +// WholeModel-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}> +// WholeModel-DAG: %[[output1:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> () +// WholeModel-DAG: return %[[output0]], %[[output1]] + +// IntPerLayer-LABEL: func @conv +// IntPerLayer-DAG: %[[w:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}1.600000e-01, 1.000000e-01 +// IntPerLayer-DAG: %[[b:.*]] = "tf.Const"() <{value = dense<[-2.000000e+00, 3.000000e+00 +// IntPerLayer-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2} +// IntPerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// IntPerLayer-DAG: %[[output1_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %cst, %cst_0) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0} +// IntPerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> () +// IntPerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> () +// IntPerLayer-DAG: return %[[output0]], %[[output1_quantized]] + +// FloatPerLayer-LABEL: func @conv +// FloatPerLayer-DAG: %[[w:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}1.600000e-01, 1.000000e-01 +// FloatPerLayer-DAG: %[[b:.*]] = "tf.Const"() <{value = dense<[-2.000000e+00, 3.000000e+00 +// FloatPerLayer-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", 
executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2} +// FloatPerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// FloatPerLayer-DAG: %[[output1_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0} +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> () +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> () +// FloatPerLayer-DAG: return %[[output0]], %[[output1_unquantized]] +} + +// ----- + +module { + func.func @multiple_conv2d(%arg: tensor) -> tensor { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_1 = "tf.Const"() {value = dense<[[[[0.193340182, 0.285152316], [0.41538316, -0.313452125]], [[0.188379049, 0.0693640113], [-0.199678659, -0.0629909635]]], [[[0.141592324, 0.554834187], [-0.224576354, 0.103607118]], [[0.134974658, -2.952230e-02], [-0.15929231, -0.538676262]]]]> : tensor<2x2x2x2xf32>} : () -> tensor<2x2x2x2xf32> + %cst_2 = "tf.Const"() {value = dense<[[[[-0.174680978, -0.367524445], [-0.0481151938, -0.154707015]], [[-0.0463985205, 0.457213104], [-0.0713823438, 0.0317451358]]], [[[-0.335502505, 0.00602310896], [0.307939529, 0.49636358]], [[-0.223585874, -0.194682062], [0.0728010535, 0.43586427]]]]> : tensor<2x2x2x2xf32>} : () -> tensor<2x2x2x2xf32> + %0 = "tf.PartitionedCall"(%arg, %cst_1, %cst) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2} : (tensor, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor loc(callsite("test@multiple_conv2d"("Conv2D") at "QuantizationUnit(\12\06Conv2D\1a\0fmultiple_conv2d)")) + %1 = "tf.PartitionedCall"(%0, %cst_2, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} : (tensor, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor loc(callsite("test@multiple_conv2d"("Conv2D_1") at "QuantizationUnit(\12\08Conv2D_1\1a\0fmultiple_conv2d)")) + return %1 : tensor + } + + func.func private @composite_conv2d_with_bias_and_relu6_fn_2(%arg0: tensor, %arg1: tensor<2x2x2x2xf32>, %arg2: tensor<2xf32>) -> tensor { + %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor, tensor<2x2x2x2xf32>) -> tensor + %1 = "tf.BiasAdd"(%0, %arg2) {data_format = "NHWC", device = ""} : (tensor, tensor<2xf32>) -> tensor + %2 = "tf.Relu6"(%1) {device = ""} : (tensor) -> tensor + return %2 : tensor + } + + func.func private @composite_conv2d_with_bias_and_relu6_fn_1(%arg0: tensor, %arg1: tensor<2x2x2x2xf32>, %arg2: tensor<2xf32>) -> 
tensor { + %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor, tensor<2x2x2x2xf32>) -> tensor + %1 = "tf.BiasAdd"(%0, %arg2) {data_format = "NHWC", device = ""} : (tensor, tensor<2xf32>) -> tensor + %2 = "tf.Relu6"(%1) {device = ""} : (tensor) -> tensor + return %2 : tensor + } + +// WholeModel-LABEL: func @multiple_conv2d +// WholeModel-DAG: %[[b0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> +// WholeModel-DAG: %[[b1:.*]] = "tf.Const"() <{value = dense<1.000000e+00> +// WholeModel-DAG: %[[w0:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}0.193340182, 0.285152316 +// WholeModel-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-0.174680978, -0.367524445 +// WholeModel-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}> {_tfl_quant_trait = "fully_quantizable"} +// WholeModel-DAG: "tf.DumpTensor"(%[[output0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}> +// WholeModel-DAG: %[[output1:.*]] = "tf.PartitionedCall"(%[[output0]], %[[w1]], %[[b1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> +// WholeModel-DAG: return %[[output1]] + +// IntPerLayer-LABEL: func @multiple_conv2d +// IntPerLayer-DAG: %[[b0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> +// IntPerLayer-DAG: %[[b1:.*]] = "tf.Const"() <{value = dense<1.000000e+00> +// IntPerLayer-DAG: %[[w0:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}0.193340182, 0.285152316 +// IntPerLayer-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-0.174680978, -0.367524445 +// IntPerLayer-DAG: %[[output0_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}> {_tfl_quant_trait = "fully_quantizable"} +// IntPerLayer-DAG: %[[output0_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_0}> +// IntPerLayer-DAG: "tf.DumpTensor"(%[[output0_quantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}> +// IntPerLayer-DAG: "tf.DumpTensor"(%[[output0_unquantized]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}> +// IntPerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%[[output0_quantized]], %[[w1]], %[[b1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// IntPerLayer-DAG: %[[output1_unquantized:.*]] = 
"tf.PartitionedCall"(%[[output0_quantized]], %[[w1]], %[[b1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0}> +// IntPerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> +// IntPerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> +// IntPerLayer-DAG: return %[[output1_quantized]] + +// FloatPerLayer-LABEL: func @multiple_conv2d +// FloatPerLayer-DAG: %[[b0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> +// FloatPerLayer-DAG: %[[b1:.*]] = "tf.Const"() <{value = dense<1.000000e+00> +// FloatPerLayer-DAG: %[[w0:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}0.193340182, 0.285152316 +// FloatPerLayer-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-0.174680978, -0.367524445 +// FloatPerLayer-DAG: %[[output0_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}> {_tfl_quant_trait = "fully_quantizable"} +// FloatPerLayer-DAG: %[[output0_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_0} +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output0_quantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"} +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output0_unquantized]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"} +// FloatPerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%[[output0_unquantized]], %[[w1]], %[[b1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// FloatPerLayer-DAG: %[[output1_unquantized:.*]] = "tf.PartitionedCall"(%[[output0_unquantized]], %[[w1]], %[[b1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0} +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"} +// FloatPerLayer-DAG: return %[[output1_unquantized]] +} + +// ----- + +module { + func.func @matmul2(%arg0: tensor<2x2xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<2x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense<[[-0.630731344, 0.54962182], [0.180364341, -0.764542698]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32> + %cst_0 = "tf.Const"() {device = "", value = dense<[[-0.211145893, -0.708605706], [-0.954062759, -0.614013135]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32> + 
%0 = "tf.PartitionedCall"(%arg0, %cst) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> loc(callsite("test@matmul2"("MatMul") at "QuantizationUnit(\12\06MatMul\1a\07matmul2)")) + %1 = "tf.PartitionedCall"(%0, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> loc(callsite("test@matmul2"("MatMul_1") at "QuantizationUnit(\12\08MatMul_1\1a\07matmul2)")) + return %1 : tensor<2x2xf32> + } + func.func private @composite_matmul_fn_2(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) {attr_map = "0:transpose_a,1:transpose_b", device = "", transpose_a = false, transpose_b = false} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + } + func.func private @composite_matmul_fn_1(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) {attr_map = "0:transpose_a,1:transpose_b", device = "", transpose_a = false, transpose_b = false} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + } + +// WholeModel-LABEL: func @matmul2 +// WholeModel-DAG: %[[w0:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.630731344 +// WholeModel-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893 +// WholeModel-DAG: %[[m0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"} +// WholeModel-DAG: %[[m1:.*]] = "tf.PartitionedCall"(%[[m0]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// WholeModel-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"} +// WholeModel-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"} +// WholeModel-DAG: return %[[m1]] + +// IntPerLayer-LABEL: func @matmul2 +// IntPerLayer-DAG: %[[w0:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.630731344 +// IntPerLayer-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893 +// IntPerLayer-DAG: %[[m0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// IntPerLayer-DAG: %[[m0_1:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// IntPerLayer-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> () +// IntPerLayer-DAG: "tf.DumpTensor"(%[[m0_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) 
-> () +// IntPerLayer-DAG: %[[m1:.*]] = "tf.PartitionedCall"(%[[m0]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// IntPerLayer-DAG: %[[m1_0:.*]] = "tf.PartitionedCall"(%[[m0]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// IntPerLayer-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> () +// IntPerLayer-DAG: "tf.DumpTensor"(%[[m1_0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> () +// IntPerLayer-DAG: return %[[m1]] : tensor<2x2xf32> + +// FloatPerLayer-LABEL: func @matmul2 +// FloatPerLayer-DAG: %[[w0:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.630731344 +// FloatPerLayer-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893 +// FloatPerLayer-DAG: %[[m0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// FloatPerLayer-DAG: %[[m0_1:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> () +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m0_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> () +// FloatPerLayer-DAG: %[[m1:.*]] = "tf.PartitionedCall"(%[[m0_1]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// FloatPerLayer-DAG: %[[m1_0:.*]] = "tf.PartitionedCall"(%[[m0_1]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> () +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m1_0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> () +// FloatPerLayer-DAG: return %[[m1_0]] : tensor<2x2xf32> +} + +// ----- + +module { + func.func @matmul2_softmax(%arg0: tensor<2x2xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<2x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense<[[-0.630731344, 0.54962182], [0.180364341, -0.764542698]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32> + %cst_0 = "tf.Const"() {device = "", value = dense<[[-0.211145893, -0.708605706], [-0.954062759, -0.614013135]]> : 
tensor<2x2xf32>} : () -> tensor<2x2xf32> + %0 = "tf.PartitionedCall"(%arg0, %cst) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> loc(callsite("test@matmul2_softmax"("MatMul") at "QuantizationUnit(\12\06MatMul\1a\0fmatmul2_softmax)")) + %1 = "tf.Softmax"(%0) {T = "tfdtype$DT_FLOAT"} : (tensor<2x2xf32>) -> tensor<2x2xf32> + %2 = "tf.PartitionedCall"(%1, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> loc(callsite("test@matmul2_softmax"("MatMul_1") at "QuantizationUnit(\12\08MatMul_1\1a\0fmatmul2_softmax)")) + return %2 : tensor<2x2xf32> + } + func.func private @composite_matmul_fn_2(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) {attr_map = "0:transpose_a,1:transpose_b", device = "", transpose_a = false, transpose_b = false} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + } + func.func private @composite_matmul_fn_1(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) {attr_map = "0:transpose_a,1:transpose_b", device = "", transpose_a = false, transpose_b = false} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + } + +// WholeModel-LABEL: func @matmul2_softmax +// WholeModel-DAG: %[[cst_0:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.630731344, 0.54962182 +// WholeModel-DAG: %[[cst_1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893, -0.708605706 +// WholeModel-DAG: %[[pc_0:.*]] = "tf.PartitionedCall"(%arg0, %[[cst_0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"} +// WholeModel-DAG: %[[sm_0:.*]] = "tf.Softmax"(%[[pc_0]]) {T = "tfdtype$DT_FLOAT"} +// WholeModel-DAG: %[[pc_1:.*]] = "tf.PartitionedCall"(%[[sm_0]], %[[cst_1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// WholeModel-DAG: "tf.DumpTensor"(%[[pc_0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"} +// WholeModel-DAG: "tf.DumpTensor"(%[[pc_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"} +// WholeModel-DAG: return %[[pc_1]] + +// IntPerLayer-LABEL: func @matmul2_softmax +// IntPerLayer-DAG: %[[cst_0:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.630731344, 0.54962182 +// IntPerLayer-DAG: %[[cst_1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893, -0.708605706 +// IntPerLayer-DAG: %[[pc_0:.*]] = "tf.PartitionedCall"(%arg0, %[[cst_0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"} +// IntPerLayer-DAG: %[[pc_1:.*]] = "tf.PartitionedCall"(%arg0, %[[cst_0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2_0} +// IntPerLayer-DAG: "tf.DumpTensor"(%[[pc_0]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", 
node_name = "MatMul"} +// IntPerLayer-DAG: "tf.DumpTensor"(%[[pc_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"} +// IntPerLayer-DAG: %[[sm_0:.*]] = "tf.Softmax"(%[[pc_0]]) {T = "tfdtype$DT_FLOAT"} +// IntPerLayer-DAG: %[[pc_2:.*]] = "tf.PartitionedCall"(%[[sm_0]], %[[cst_1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// IntPerLayer-DAG: %[[pc_3:.*]] = "tf.PartitionedCall"(%[[sm_0]], %[[cst_1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1_0} +// IntPerLayer-DAG: "tf.DumpTensor"(%[[pc_2]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"} +// IntPerLayer-DAG: "tf.DumpTensor"(%[[pc_3]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"} +// IntPerLayer-DAG: return %[[pc_2]] + +// FloatPerLayer-LABEL: func @matmul2_softmax +// FloatPerLayer-DAG: %[[cst_0:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.630731344, 0.54962182 +// FloatPerLayer-DAG: %[[cst_1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893, -0.708605706 +// FloatPerLayer-DAG: %[[pc_0:.*]] = "tf.PartitionedCall"(%arg0, %[[cst_0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"} +// FloatPerLayer-DAG: %[[pc_1:.*]] = "tf.PartitionedCall"(%arg0, %[[cst_0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2_0} +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[pc_0]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"} +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[pc_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"} +// FloatPerLayer-DAG: %[[sm_0:.*]] = "tf.Softmax"(%[[pc_1]]) {T = "tfdtype$DT_FLOAT"} +// FloatPerLayer-DAG: %[[pc_2:.*]] = "tf.PartitionedCall"(%[[sm_0]], %[[cst_1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// FloatPerLayer-DAG: %[[pc_3:.*]] = "tf.PartitionedCall"(%[[sm_0]], %[[cst_1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1_0} +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[pc_2]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"} +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[pc_3]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"} +// FloatPerLayer-DAG: return %[[pc_3]] +} + +// ----- + +module { + func.func @matmul2_concat(%arg0: tensor<2x2xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<2x4xf32>) { + %cst = "tf.Const"() {device = "", value = dense<[[-0.630731344, 0.54962182], [0.180364341, -0.764542698]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32> + %cst_0 = "tf.Const"() {device = "", value = dense<[[-0.211145893, -0.708605706], [-0.954062759, -0.614013135]]> : 
tensor<2x2xf32>} : () -> tensor<2x2xf32> + %cst_1 = "tf.Const"() { value = dense<-1> : tensor } : () -> tensor + %0 = "tf.PartitionedCall"(%arg0, %cst) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> loc(callsite("test@matmul2_concat"("MatMul") at "QuantizationUnit(\12\06MatMul\1a\0ematmul2_concat)")) + %1 = "tf.PartitionedCall"(%0, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> loc(callsite("test@matmul2_concat"("MatMul_1") at "QuantizationUnit(\12\08MatMul_1\1a\0ematmul2_concat)")) + %2 = "tf.ConcatV2"(%0, %1, %cst_1) : (tensor<2x2xf32>, tensor<2x2xf32>, tensor) -> tensor<2x4xf32> + return %2 : tensor<2x4xf32> + } + func.func private @composite_matmul_fn_2(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) {attr_map = "0:transpose_a,1:transpose_b", device = "", transpose_a = false, transpose_b = false} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + } + func.func private @composite_matmul_fn_1(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) {attr_map = "0:transpose_a,1:transpose_b", device = "", transpose_a = false, transpose_b = false} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + } + +// WholeModel-LABEL: func @matmul2_concat +// WholeModel-DAG: %[[w0:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.630731344 +// WholeModel-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893 +// WholeModel-DAG: %[[axis:.*]] = "tf.Const"() <{value = dense<-1> : tensor} +// WholeModel-DAG: %[[m0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"} +// WholeModel-DAG: %[[m1:.*]] = "tf.PartitionedCall"(%[[m0]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"} +// WholeModel-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"} +// WholeModel-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"} +// WholeModel-DAG: %[[c:.*]] = "tf.ConcatV2"(%[[m0]], %[[m1]], %[[axis]]) +// WholeModel-DAG: return %[[c]] + +// IntPerLayer-LABEL: func @matmul2_concat +// IntPerLayer-DAG: %[[w0:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.630731344 +// IntPerLayer-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893 +// IntPerLayer-DAG: %[[axis:.*]] = "tf.Const"() <{value = dense<-1> : tensor}> : () -> tensor +// IntPerLayer-DAG: %[[m0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// IntPerLayer-DAG: %[[m0_1:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = 
@composite_matmul_fn_2_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// IntPerLayer-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> () +// IntPerLayer-DAG: "tf.DumpTensor"(%[[m0_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> () +// IntPerLayer-DAG: %[[m1:.*]] = "tf.PartitionedCall"(%[[m0]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// IntPerLayer-DAG: %[[m1_0:.*]] = "tf.PartitionedCall"(%[[m0]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// IntPerLayer-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> () +// IntPerLayer-DAG: "tf.DumpTensor"(%[[m1_0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> () +// IntPerLayer-DAG: %4 = "tf.ConcatV2"(%[[m0]], %[[m1]], %[[axis]]) : (tensor<2x2xf32>, tensor<2x2xf32>, tensor) -> tensor<2x4xf32> +// IntPerLayer-DAG: return %4 : tensor<2x4xf32> + +// FloatPerLayer-LABEL: func @matmul2_concat +// FloatPerLayer-DAG: %[[w0:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.630731344 +// FloatPerLayer-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893 +// FloatPerLayer-DAG: %[[axis:.*]] = "tf.Const"() <{value = dense<-1> : tensor}> : () -> tensor +// FloatPerLayer-DAG: %[[m0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// FloatPerLayer-DAG: %[[m0_1:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> () +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m0_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> () +// FloatPerLayer-DAG: %[[m1:.*]] = "tf.PartitionedCall"(%[[m0_1]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// FloatPerLayer-DAG: %[[m1_0:.*]] = "tf.PartitionedCall"(%[[m0_1]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = 
"matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> () +// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m1_0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> () +// FloatPerLayer-DAG: %4 = "tf.ConcatV2"(%1, %[[m1_0]], %[[axis]]) : (tensor<2x2xf32>, tensor<2x2xf32>, tensor) -> tensor<2x4xf32> +// FloatPerLayer-DAG: return %4 : tensor<2x4xf32> +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_add_quantization_unit_loc.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_add_quantization_unit_loc.mlir new file mode 100644 index 000000000000..81c735b75133 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_add_quantization_unit_loc.mlir @@ -0,0 +1,50 @@ +// RUN: tf-quant-opt %s -mlir-print-debuginfo -mlir-print-local-scope -tf-quant-add-quantization-unit-loc | FileCheck %s + +func.func @conv2d_unmatching_loc_pattern(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16> + %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xbf16> + %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1]} + : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> loc("Model/conv2d") + %2 = "tf.Cast"(%1) {Truncate = false} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xf32> + %3 = "tf.IdentityN"(%2) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + return %3 : tensor<1x3x2x2xf32> +// CHECK: tf.Conv2D +// CHECK-SAME: loc("Model/conv2d") +} + +func.func @conv2d_with_valid_loc(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16> + %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xbf16> + %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1]} + : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> loc(fused["Conv2D:", "Model/conv2d"]) + %2 = "tf.Cast"(%1) {Truncate = false} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xf32> + %3 = "tf.IdentityN"(%2) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + return %3 : tensor<1x3x2x2xf32> +// CHECK: tf.Conv2D +// CHECK-SAME: loc(callsite("Model/conv2d@conv2d_with_valid_loc"("Conv2D") at "QuantizationUnit({{.*}})")) +} + +func.func @conv2d_with_callsite_loc(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16> + %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xbf16> + %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1]} + : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> loc(fused["Conv2D:", callsite("Model/conv2d" at "model.py":10:8)]) + %2 = "tf.Cast"(%1) {Truncate = false} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xf32> + %3 = "tf.IdentityN"(%2) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + return %3 : tensor<1x3x2x2xf32> +// CHECK: tf.Conv2D 
+// CHECK-SAME: loc(callsite("Model/conv2d@conv2d_with_callsite_loc"("Conv2D") at "QuantizationUnit({{.*}})")) +} + +func.func @conv2d_with_func_name(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16> + %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xbf16> + %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1]} + : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> loc(fused["Conv2D:", "Model/conv2d@original_func"]) + %2 = "tf.Cast"(%1) {Truncate = false} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xf32> + %3 = "tf.IdentityN"(%2) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + return %3 : tensor<1x3x2x2xf32> +// CHECK: tf.Conv2D +// CHECK-SAME: loc(callsite("Model/conv2d@original_func"("Conv2D") at "QuantizationUnit({{.*}})")) +} + diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_cast_bf16_ops_to_f32.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_cast_bf16_ops_to_f32.mlir new file mode 100644 index 000000000000..c9be645a1415 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_cast_bf16_ops_to_f32.mlir @@ -0,0 +1,114 @@ +// RUN: tf-quant-opt %s -tf-quant-cast-bf16-ops-to-f32 | FileCheck %s + +func.func @cast_bf16_conv_to_fp32(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16> + %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xbf16> + %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> + %2 = "tf.Cast"(%1) {Truncate = false} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xf32> + %3 = "tf.IdentityN"(%2) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + return %3 : tensor<1x3x2x2xf32> +} + +// CHECK: func @cast_bf16_conv_to_fp32 +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>}> {device = ""} : () -> tensor<2x3x3x2xbf16> +// CHECK: %[[cast:.*]] = "tf.Cast"(%[[cst]]) <{Truncate = false}> : (tensor<2x3x3x2xbf16>) -> tensor<2x3x3x2xf32> +// CHECK: %[[conv:.*]] = "tf.Conv2D"(%arg0, %[[cast]]) +// CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[conv]]) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK: return %[[identity]] : tensor<1x3x2x2xf32> + +func.func @cast_bf16_conv_with_bias_to_fp32(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense<1.000000e+00> : tensor<2xbf16>} : () -> tensor<2xbf16> + %cst_0 = "tf.Const"() {device = "", value = dense<1.000000e+00> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16> + %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xbf16> + %1 = "tf.Conv2D"(%0, %cst_0) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> + %2 = "tf.BiasAdd"(%1, %cst) {data_format = "NHWC", device = ""} : (tensor<1x3x2x2xbf16>, tensor<2xbf16>) -> 
tensor<1x3x2x2xbf16> + %3 = "tf.Cast"(%2) {Truncate = false} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xf32> + %4 = "tf.IdentityN"(%3) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + return %4 : tensor<1x3x2x2xf32> +} + +// CHECK: func @cast_bf16_conv_with_bias_to_fp32 +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK: %[[conv:.*]] = "tf.Conv2D"(%arg0, %[[cst]]) +// CHECK: %[[bias_add:.*]] = "tf.BiasAdd"(%[[conv]], %[[cst_0]]) +// CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[bias_add]]) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK: return %[[identity]] : tensor<1x3x2x2xf32> + +func.func @cast_bf16_avg_pool_to_fp32(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense<1.000000e+00> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16> + %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xbf16> + %1 = "tf.Conv2D"(%0, %cst) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> + %2 = "tf.AvgPool"(%1) {data_format = "NHWC", device = "", ksize = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xbf16> + %3 = "tf.Cast"(%2) {Truncate = false} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xf32> + %4 = "tf.IdentityN"(%3) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + return %4 : tensor<1x3x2x2xf32> +} + +// CHECK: func @cast_bf16_avg_pool_to_fp32 +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK: %[[conv:.*]] = "tf.Conv2D"(%arg0, %[[cst]]) +// CHECK: %[[avg_pool:.*]] = "tf.AvgPool"(%[[conv]]) +// CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[avg_pool]]) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK: return %[[identity]] : tensor<1x3x2x2xf32> + +func.func @cast_bf16_matmul_to_fp32(%arg0: tensor<1x10xf32>) -> (tensor<1x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense<1.000000e+01> : tensor<10x2xbf16>} : () -> tensor<10x2xbf16> + %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x10xf32>) -> tensor<1x10xbf16> + %1 = "tf.MatMul"(%0, %cst) {device = "", transpose_a = false, transpose_b = false} : (tensor<1x10xbf16>, tensor<10x2xbf16>) -> tensor<1x2xbf16> + %2 = "tf.Cast"(%1) {Truncate = false} : (tensor<1x2xbf16>) -> tensor<1x2xf32> + %3 = "tf.IdentityN"(%2) {device = ""} : (tensor<1x2xf32>) -> tensor<1x2xf32> + return %3 : tensor<1x2xf32> +} + +// CHECK: func @cast_bf16_matmul_to_fp32 +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<10x2xf32>}> : () -> tensor<10x2xf32> +// CHECK: %[[matmul:.*]] = "tf.MatMul"(%arg0, %[[cst]]) +// CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[matmul]]) +// CHECK: return %[[identity]] : tensor<1x2xf32> + +func.func @cast_bf16_depthwise_conv_to_fp32(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x2x2x6xf32>) { + %cst = "tf.Const"() {device = "", value = dense<1.000000e+01> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16> + %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xbf16> + %1 = "tf.DepthwiseConv2dNative"(%0, %cst) 
{data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1]} : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x2x2x6xbf16> + %2 = "tf.Cast"(%1) {Truncate = false} : (tensor<1x2x2x6xbf16>) -> tensor<1x2x2x6xf32> + %3 = "tf.IdentityN"(%2) {device = ""} : (tensor<1x2x2x6xf32>) -> tensor<1x2x2x6xf32> + return %3 : tensor<1x2x2x6xf32> +} + +// CHECK: func @cast_bf16_depthwise_conv_to_fp32 +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK: %[[depthwise_conv:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[cst]]) +// CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[depthwise_conv]]) {device = ""} : (tensor<1x2x2x6xf32>) -> tensor<1x2x2x6xf32> +// CHECK: return %[[identity]] : tensor<1x2x2x6xf32> + +func.func @cast_bf16_batch_matmul_v2_to_fp32(%arg0: tensor<1x1x10xf32>) -> (tensor<1x1x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense<1.000000e+01> : tensor<10x2xbf16>} : () -> tensor<10x2xbf16> + %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x1x10xf32>) -> tensor<1x1x10xbf16> + %1 = "tf.BatchMatMulV2"(%0, %cst) {adj_x = false, adj_y = false, device = ""} : (tensor<1x1x10xbf16>, tensor<10x2xbf16>) -> tensor<1x1x2xbf16> + %2 = "tf.Cast"(%1) {Truncate = false} : (tensor<1x1x2xbf16>) -> tensor<1x1x2xf32> + %3 = "tf.IdentityN"(%2) {device = ""} : (tensor<1x1x2xf32>) -> tensor<1x1x2xf32> + return %3 : tensor<1x1x2xf32> +} + +// CHECK: func @cast_bf16_batch_matmul_v2_to_fp32 +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<10x2xf32>}> : () -> tensor<10x2xf32> +// CHECK: %[[batch_matmul:.*]] = "tf.BatchMatMulV2"(%arg0, %[[cst]]) +// CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[batch_matmul]]) {device = ""} : (tensor<1x1x2xf32>) -> tensor<1x1x2xf32> +// CHECK: return %[[identity]] : tensor<1x1x2xf32> + +// Tests that an AddV2 op accepting two bf16 operands is transformed into +// an AddV2 op that accepts two fp32 operands. +func.func @cast_bf16_add_v2_to_fp32(%arg0: tensor<2xbf16>, %arg1: tensor<2xbf16>) -> tensor<2xf32> { + %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2xbf16>, tensor<2xbf16>) -> tensor<2xbf16> + %1 = "tf.Cast"(%0) {Truncate = false} : (tensor<2xbf16>) -> tensor<2xf32> + return %1 : tensor<2xf32> +} +// The signature of the function is not changed. +// CHECK: func @cast_bf16_add_v2_to_fp32(%[[ARG_0:.*]]: tensor<2xbf16>, %[[ARG_1:.*]]: tensor<2xbf16>) -> tensor<2xf32> + +// bfloat16 operands are cast to f32 operands. 
+// CHECK-DAG: %[[CAST_0:.*]] = "tf.Cast"(%[[ARG_0]]) <{Truncate = false}> : (tensor<2xbf16>) -> tensor<2xf32> +// CHECK-DAG: %[[CAST_1:.*]] = "tf.Cast"(%[[ARG_1]]) <{Truncate = false}> : (tensor<2xbf16>) -> tensor<2xf32> +// CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[CAST_0]], %[[CAST_1]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[ADD]] : tensor<2xf32> diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_convert_custom_aggregation_op_to_quant_stats.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_convert_custom_aggregation_op_to_quant_stats.mlir new file mode 100644 index 000000000000..bc3b96a8c4b6 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_convert_custom_aggregation_op_to_quant_stats.mlir @@ -0,0 +1,19 @@ +// RUN: tf-quant-opt %s -tf-quant-convert-tf-custom-aggregator-op-to-quant-stats | FileCheck %s + +func.func @customAggregator(%arg0: tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>) { + %0:4 = "tf.CustomAggregator"(%arg0) {min = -0.1 : f32, max = 0.2 : f32, id = "0", calibration_method = 1 : i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + %1:4 = "tf.CustomAggregator"(%arg0) {id = "1", calibration_method = 1 : i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + func.return %0#0, %1#0 : tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32> +} +// CHECK: func @customAggregator +// CHECK-NEXT: %[[stats:.*]] = "quantization.stats"(%arg0) <{layerStats = dense<[-1.000000e-01, 2.000000e-01]> : tensor<2xf32>}> : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> +// CHECK-NEXT: return %[[stats]], %arg0 + +func.func @doNotHandleNoMinMaxCases(%arg0: tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>) { + %0:4 = "tf.CustomAggregator"(%arg0) {min = -0.1 : f32, id = "1", calibration_method = 1 : i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + %1:4 = "tf.CustomAggregator"(%arg0) {max = 0.2 : f32, id = "2", calibration_method = 1 : i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + %2:4 = "tf.CustomAggregator"(%arg0) {id = "3", calibration_method = 1 : i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + func.return %0#0, %1#0, %2#0 : tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32> +} +// CHECK: func @doNotHandleNoMinMaxCases +// CHECK-NOT: "quantization.stats" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_convert_fake_quant_to_qdq.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_convert_fake_quant_to_qdq.mlir new file mode 100644 index 000000000000..2909f73d4bba --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_convert_fake_quant_to_qdq.mlir @@ -0,0 +1,44 @@ +// RUN: tf-quant-opt %s -tf-quant-convert-fake-quant-to-qdq | FileCheck %s + +func.func @fakeQuantArgs(%arg0: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> { + %0 = "tf.FakeQuantWithMinMaxArgs"(%arg0) 
{ + min = -0.1 : f32, max = 0.2 : f32, num_bits = 8 + } : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> + func.return %0 : tensor<8x8x8x8xf32> +} +// CHECK: func @fakeQuantArgs +// CHECK-NEXT: %[[q:.*]] = "quantization.qcast"(%arg0) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8x!quant.uniform> +// CHECK-NEXT: %[[dq:.*]] = "quantization.dcast"(%[[q]]) +// CHECK-NEXT: return %[[dq]] + +func.func @doNotHandleNonEightBitFakeQuant(%arg0: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> { + %0 = "tf.FakeQuantWithMinMaxArgs"(%arg0) { + min = -0.1 : f32, max = 0.2 : f32, num_bits = 16 + } : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> + func.return %0 : tensor<8x8x8x8xf32> +} +// CHECK: func @doNotHandleNonEightBitFakeQuant +// CHECK: tf.FakeQuantWithMinMaxArgs +// CHECK-NOT: "quantization.qcast" + +func.func @fakeQuantVars(%arg0: tensor<3xf32>, %arg1: tensor<4x3xf32>) -> (tensor<3xf32>, tensor<4x3xf32>) { + %cst = "tf.Const"() {value = dense<-0.950868546> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<9.951540e-01> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<[-0.5, -0.4, -0.7]> : tensor<3xf32>} : () -> tensor<3xf32> + %cst_2 = "tf.Const"() {value = dense<[0.5, 0.6, 0.3]> : tensor<3xf32>} : () -> tensor<3xf32> + %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) { + device = "", narrow_range = false, num_bits = 8 : i64 + } : (tensor<3xf32>, tensor, tensor) -> tensor<3xf32> + %1 = "tf.FakeQuantWithMinMaxVarsPerChannel"(%arg1, %cst_1, %cst_2) { + device = "", narrow_range = true, num_bits = 8 : i64 + } : (tensor<4x3xf32>, tensor<3xf32>, tensor<3xf32>) -> tensor<4x3xf32> + func.return %0, %1 : tensor<3xf32>, tensor<4x3xf32> +} + +// CHECK: %[[q1:.*]] = "quantization.qcast"(%arg0) +// CHECK-SAME: tensor<3x!quant.uniform> +// CHECK: %[[dq1:.*]] = "quantization.dcast"(%[[q1]]) +// CHECK: %[[q2:.*]] = "quantization.qcast"(%arg1) +// CHECK-SAME: tensor<4x3x!quant.uniform:f32:1, {0.003937007874015748,0.0039370079913477263:-25,0.003937007874015748:51}>> +// CHECK: %[[dq2:.*]] = "quantization.dcast"(%[[q2]]) +// CHECK: return %[[dq1]], %[[dq2]] diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_convert_tf_xla_op_to_tf_op.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_convert_tf_xla_op_to_tf_op.mlir new file mode 100644 index 000000000000..4f881c9a2dec --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_convert_tf_xla_op_to_tf_op.mlir @@ -0,0 +1,58 @@ +// RUN: tf-quant-opt %s -tf-quant-convert-tf-xla-op-to-tf-op -split-input-file | FileCheck %s + +func.func @xla_dot_v2(%arg0: tensor, %arg1: tensor<3x4x5xf32>) -> (tensor) { + %0 = "tf.XlaDotV2"(%arg0, %arg1) {device = "", dimension_numbers = "\0A\01\02\12\01\00", precision_config = ""} : (tensor, tensor<3x4x5xf32>) -> tensor + func.return %0 : tensor +} + +// CHECK: func @xla_dot_v2 +// CHECK: %[[einsum:.*]] = "tf.Einsum"(%arg0, %arg1) <{equation = "abc,cde->abde"}> : (tensor, tensor<3x4x5xf32>) -> tensor +// CHECK: return %[[einsum]] : tensor + +// ----- + +// dimension_numbers: { +// offset_dims: 0 +// collapsed_slice_dims: 1 +// start_index_map: 1 +// } +func.func @xla_gather(%arg0: tensor, %arg1: tensor<1xi32>, %arg2: tensor<2xi32>) -> tensor<*xf32> { + %0 = "tf.XlaGather"(%arg0, %arg1, %arg2) {device = "", dimension_numbers = "\0A\01\00\12\01\01\1A\01\01", indices_are_sorted = true} : (tensor, tensor<1xi32>, tensor<2xi32>) -> tensor<*xf32> + func.return %0 : tensor<*xf32> +} + +// CHECK: func @xla_gather +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = 
dense<0> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() <{value = dense<1> : tensor<1x1xi64>}> : () -> tensor<1x1xi64> +// CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() <{value = dense<-1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK: %[[arg1_i64:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor<1xi32>) -> tensor<1xi64> +// CHECK: %[[tensor_scatter_update:.*]] = "tf.TensorScatterUpdate"(%[[cst]], %[[cst_0]], %[[arg1_i64]]) <{bad_indices_policy = ""}> : (tensor<2xi64>, tensor<1x1xi64>, tensor<1xi64>) -> tensor<2xi64> +// CHECK: %[[arg2_i64:.*]] = "tf.Cast"(%arg2) <{Truncate = false}> : (tensor<2xi32>) -> tensor<2xi64> +// CHECK: %[[slice:.*]] = "tf.Slice"(%arg0, %[[tensor_scatter_update]], %[[arg2_i64]]) : (tensor, tensor<2xi64>, tensor<2xi64>) -> tensor<*xf32> +// CHECK: %[[reshape:.*]] = "tf.Reshape"(%[[slice]], %[[cst_1]]) : (tensor<*xf32>, tensor<1xi64>) -> tensor<*xf32> +// CHECK: return %[[reshape]] : tensor<*xf32> + +// ----- + +// Tests that the converted `tf.Slice` has the correct number of dimensions +// when the output shape is known (`tensor` instead of `tensor<*xi32>`). + +func.func @xla_gather_known_output_shape(%arg0: tensor<5xi32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>) -> tensor { + // dimension_numbers: { + // collapsed_slice_dims: 0 + // start_index_map: 0 + // } + %0 = "tf.XlaGather"(%arg0, %arg1, %arg2) {device = "", dimension_numbers = "\12\01\00\1A\01\00", indices_are_sorted = true} : (tensor<5xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + func.return %0 : tensor +} + +// CHECK: func @xla_gather_known_output_shape +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() <{value = dense<0> : tensor<1x1xi64>}> : () -> tensor<1x1xi64> +// CHECK-DAG: %[[cst_1:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi64>}> : () -> tensor<0xi64> +// CHECK: %[[arg1_i64:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor<1xi32>) -> tensor<1xi64> +// CHECK: %[[tensor_scatter_update:.*]] = "tf.TensorScatterUpdate"(%[[cst]], %[[cst_0]], %[[arg1_i64]]) <{bad_indices_policy = ""}> : (tensor<1xi64>, tensor<1x1xi64>, tensor<1xi64>) -> tensor<1xi64> +// CHECK: %[[arg2_i64:.*]] = "tf.Cast"(%arg2) <{Truncate = false}> : (tensor<1xi32>) -> tensor<1xi64> +// CHECK: %[[slice:.*]] = "tf.Slice"(%arg0, %[[tensor_scatter_update]], %[[arg2_i64]]) : (tensor<5xi32>, tensor<1xi64>, tensor<1xi64>) -> tensor<1xi32> +// CHECK: %[[reshape:.*]] = "tf.Reshape"(%[[slice]], %[[cst_1]]) : (tensor<1xi32>, tensor<0xi64>) -> tensor +// CHECK: return %[[reshape]] : tensor diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_convert_tpu_model_to_cpu.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_convert_tpu_model_to_cpu.mlir new file mode 100644 index 000000000000..207fb96ea8ee --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_convert_tpu_model_to_cpu.mlir @@ -0,0 +1,56 @@ +// RUN: tf-quant-opt %s -tf-quant-convert-tpu-model-to-cpu -inline -tf-quant-cast-bf16-ops-to-f32 -split-input-file | \ +// RUN: FileCheck %s + +// Remove TPU related ops. 
+func.func @tpu_conv(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x2x2xf32> { + %0 = "tf.TPUOrdinalSelector"() {device = ""} : () -> tensor + %1 = "tf.TPUPartitionedCall"(%arg0, %0) {autotuner_thresh = 0 : i64, device = "", f = @tpu_func_0_optim0} : (tensor<1x3x4x3xf32>, tensor) -> tensor<1x3x2x2xf32> + %2 = "tf.IdentityN"(%1) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + func.return %2 : tensor<1x3x2x2xf32> +} + +func.func private @tpu_func_0_optim0(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x2x2xf32> attributes {tf._original_func_name = "tpu_func_0_optim"} { + %cst = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>} : () -> tensor<2x3x3x2xbf16> + %cst_0 = "tf.Const"() {device = "", value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %cst_1 = "tf.Const"() {_tpu_replicate = "cluster", device = "", value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %0 = "tf.Cast"(%arg0) {Truncate = false, device = ""} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xbf16> + "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [], host_compute_core = [], num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", tpu_compile_options_proto = "", use_spmd_for_xla_partitioning = false, use_tpu = true} : () -> () + %1 = "tf.TPUCompilationResult"() {_tpu_compilation_status = "cluster", device = ""} : () -> tensor + %2 = "tf.Transpose"(%0, %cst_0) {device = ""} : (tensor<1x3x4x3xbf16>, tensor<4xi32>) -> tensor<1x3x3x4xbf16> + %3 = "tf.TPUReplicatedInput"(%2) {device = "", index = -1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor<1x3x3x4xbf16>) -> tensor<1x3x3x4xbf16> + %4 = "tf.Transpose"(%3, %cst_1) {_tpu_replicate = "cluster", device = ""} : (tensor<1x3x3x4xbf16>, tensor<4xi32>) -> tensor<1x3x4x3xbf16> + %5 = "tf.Conv2D"(%4, %cst) {_tpu_replicate = "cluster", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xbf16>, tensor<2x3x3x2xbf16>) -> tensor<1x3x2x2xbf16> + %6 = "tf.TPUReplicatedOutput"(%5) {device = ""} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xbf16> + %7 = "tf.Cast"(%6) {Truncate = false} : (tensor<1x3x2x2xbf16>) -> tensor<1x3x2x2xf32> + func.return %7 : tensor<1x3x2x2xf32> +} + +// CHECK: func @tpu_conv(%[[ARG0:.*]]: tensor<1x3x4x3xf32>) +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense_resource<__elided__> : tensor<2x3x3x2xbf16>}> {device = ""} : () -> tensor<2x3x3x2xbf16> +// CHECK: %[[cast:.*]] = "tf.Cast"(%[[cst]]) <{Truncate = false}> : (tensor<2x3x3x2xbf16>) -> tensor<2x3x3x2xf32> +// CHECK: %[[conv:.*]] = "tf.Conv2D"(%[[ARG0]], %[[cast]]) +// CHECK: %[[identity:.*]] = "tf.IdentityN"(%[[conv]]) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK: return %[[identity]] : tensor<1x3x2x2xf32> + +// ----- + +// Tests that `tf.BatchFunction` is inlined. + +func.func @serving_default(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { + %0 = "tf.BatchFunction"(%arg0, %arg1) {f = @batched_func, num_batch_threads = 1 : i64, max_batch_size = 2 : i64, batch_timeout_micros = 10000 : i64, operandSegmentSizes = array} : (tensor<1xf32>, tensor<1xf32>) -> (tensor<1xf32>) + return %0 : tensor<1xf32> +} +// The contents of `@batched_func` should have been inlined into `@serving_default`.
+// CHECK: func.func @serving_default(%[[ARG0:.*]]: tensor<1xf32>, %[[ARG1:.*]]: tensor<1xf32>) -> tensor<1xf32> +// CHECK-NOT: tf.BatchFunction +// CHECK: %[[ADD0:.*]] = "tf.AddV2"(%[[ARG0]], %[[ARG1]]) +// CHECK: return %[[ADD0]] : tensor<1xf32> + +func.func private @batched_func(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { + %0 = "tf.Identity"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> + %1 = "tf.Identity"(%arg1) : (tensor<1xf32>) -> tensor<1xf32> + %2 = "tf.AddV2"(%0, %1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + return %2: tensor<1xf32> +} +// The called function should be removed. +// CHECK-NOT: batched_func diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_duplicate_shape_determining_constants.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_duplicate_shape_determining_constants.mlir new file mode 100644 index 000000000000..ecf49fdbafd2 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_duplicate_shape_determining_constants.mlir @@ -0,0 +1,223 @@ +// RUN: tf-quant-opt %s -split-input-file -verify-diagnostics \ +// RUN: -tf-quant-duplicate-shape-determining-constants | FileCheck %s + +// CHECK-LABEL: @duplicate_const_for_shape_determining_operand_at_idx_1 +// CHECK-SAME: (%[[ARG_0:.*]]: tensor) +func.func private @duplicate_const_for_shape_determining_operand_at_idx_1(%arg0: tensor) -> tensor { + %cst = "tf.Const"() {device = "", value = dense<2> : tensor} : () -> tensor + // idx 1 should be a compile time constant + %0 = "tf.ExpandDims"(%arg0, %cst) {device = ""} : (tensor, tensor) -> tensor + %1 = "tf.AddV2"(%cst, %cst) {device = ""} : (tensor, tensor) -> tensor + + return %0 : tensor +} +// Check that the constant is cloned with same value. +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<2> : tensor +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<2> : tensor + +// Check that the constants used for tf.ExpandDims and tf.AddV2 are different. +// CHECK: %[[EXPAND_DIMS:.*]] = "tf.ExpandDims"(%[[ARG_0]], %[[CST_1]]) +// CHECK: %[[ADDV2:.*]] = "tf.AddV2"(%[[CST_0]], %[[CST_0]]) + +// ----- + +// CHECK-LABEL: @duplicate_const_for_shape_determining_operand_at_idx_2 +// CHECK-SAME: (%[[ARG_0:.*]]: tensor<16x4xf32>, %[[ARG_1:.*]]: tensor<16xi32>) +func.func private @duplicate_const_for_shape_determining_operand_at_idx_2(%arg0: tensor<16x4xf32>, %arg1: tensor<16xi32>) -> tensor<16xf32> { + %cst = "tf.Const"() {device = "", value = dense<[1]> : tensor<1xi32>} : () -> tensor<1xi32> + // idx 2 should be a compile time constant + %0 = "tf.GatherV2"(%arg0, %arg1, %cst) {batch_dims = 1: i64} : (tensor<16x4xf32>, tensor<16xi32>, tensor<1xi32>) -> tensor<16xf32> + + // Just to introduce an extra use for %cst. + %1 = "tf.AddV2"(%cst, %cst) {device = ""} : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + + return %0 : tensor<16xf32> +} +// Check that the constant is cloned with same value. +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<1> : tensor<1xi32> +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<1> : tensor<1xi32> + +// Check that the constants used for tf.GatherV2 and tf.AddV2 are different. 
+// CHECK: %[[GATHER_V2:.*]] = "tf.GatherV2"(%[[ARG_0]], %[[ARG_1]], %[[CST_1]]) +// CHECK: %[[ADDV2:.*]] = "tf.AddV2"(%[[CST_0]], %[[CST_0]]) + +// ----- + +// CHECK-LABEL: @duplicate_const_for_shape_determining_operand_with_variadic_operand +// CHECK-SAME: %[[ARG_0:.*]]: tensor<16x1xf32> +func.func private @duplicate_const_for_shape_determining_operand_with_variadic_operand(%arg0: tensor<16x1xf32>) -> tensor<16x4xf32> { + %axis = "tf.Const"() {device = "", value = dense<1> : tensor} : () -> tensor + // tf.ConcatV2 accepts a variadic operand. The last operand should be compile + // time constant. + %0 = "tf.ConcatV2"(%arg0, %arg0, %arg0, %arg0, %axis) : (tensor<16x1xf32>, tensor<16x1xf32>, tensor<16x1xf32>, tensor<16x1xf32>, tensor) -> tensor<16x4xf32> + + // Just to introduce an extra use for %cst. + %1 = "tf.AddV2"(%axis, %axis) {device = ""} : (tensor, tensor) -> tensor + + return %0 : tensor<16x4xf32> +} +// Check that the constant is cloned with same value. +// The duplicated constant is the last index of the ConcatV2 op (which +// accepts a variadic arg). +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<1> : tensor +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<1> : tensor + +// Check that the constants used for tf.ConcatV2 and tf.AddV2 are different. +// CHECK: %[[CONCAT_V2:.*]] = "tf.ConcatV2"(%[[ARG_0]], %[[ARG_0]], %[[ARG_0]], %[[ARG_0]], %[[CST_1]]) +// CHECK: %[[ADDV2:.*]] = "tf.AddV2"(%[[CST_0]], %[[CST_0]]) + +// ----- + +// CHECK-LABEL: @duplicate_const_for_multiple_shape_determining_operands +// CHECK-SAME: %[[ARG_0:.*]]: tensor<8x4x16x16x16xf32> +// CHECK-SAME: %[[ARG_1:.*]]: tensor<4x3x3x16x16xf32> +func.func private @duplicate_const_for_multiple_shape_determining_operands( + %arg0: tensor<8x4x16x16x16xf32>, %arg1: tensor<4x3x3x16x16xf32>) -> tensor<8x4x14x14x16xf32> { + %strides = "tf.Const"() {value = dense<[3, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> + %padding = "tf.Const"() {value = dense<0> : tensor<3x2xi32>} : () -> tensor<3x2xi32> + %lhs_dilation = "tf.Const"() {value = dense<[4, 1, 1]> : tensor<3xi32>} : () -> tensor<3xi32> + %rhs_dilation = "tf.Const"() {value = dense<1> : tensor<3xi32>} : () -> tensor<3xi32> + %feature_group_count = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + + // tf.XlaConvV2's 2, 3, 4, 5, 6 indices should be compile-time constants. + %0 = "tf.XlaConvV2"(%arg0, %arg1, %strides, %padding, %lhs_dilation, %rhs_dilation, %feature_group_count) { + batch_group_count = 1 : i64, + dimension_numbers = "\18\03 \042\03\00\01\02@\04P\04Z\03\01\02\03b\03\01\02\03", + precision_config = ""} : (tensor<8x4x16x16x16xf32>, tensor<4x3x3x16x16xf32>, tensor<3xi32>, + tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<8x4x14x14x16xf32> + + // Just to introduce an extra use for %cst. 
+ %1 = "tf.AddV2"(%feature_group_count, %feature_group_count) {device = ""} : (tensor, tensor) -> tensor + %2 = "tf.AddV2"(%lhs_dilation, %lhs_dilation) {device = ""} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi32> + %3 = "tf.AddV2"(%rhs_dilation, %rhs_dilation) {device = ""} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi32> + %4 = "tf.AddV2"(%padding, %padding) {device = ""} : (tensor<3x2xi32>, tensor<3x2xi32>) -> tensor<3x2xi32> + %5 = "tf.AddV2"(%strides, %strides) {device = ""} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi32> + + return %0 : tensor<8x4x14x14x16xf32> +} + +// Check that the constants that are input to XlaConvV2's 3rd, 4th, 5th, 6th +// and 7th arguments are cloned with same value. +// CHECK-DAG: %[[STRIDES:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<[3, 1, 1]> : tensor<3xi32> +// CHECK-DAG: %[[STRIDES_COPY:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<[3, 1, 1]> : tensor<3xi32> +// CHECK-DAG: %[[PADDING:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<0> : tensor<3x2xi32> +// CHECK-DAG: %[[PADDING_COPY:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<0> : tensor<3x2xi32> +// CHECK-DAG: %[[LHS_DILATION:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<[4, 1, 1]> : tensor<3xi32> +// CHECK-DAG: %[[LHS_DILATION_COPY:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<[4, 1, 1]> : tensor<3xi32> +// CHECK-DAG: %[[RHS_DILATION:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<1> : tensor<3xi32> +// CHECK-DAG: %[[RHS_DILATION_COPY:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<1> : tensor<3xi32> +// CHECK-DAG: %[[FEATURE_GROUP_COUNT:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<1> : tensor +// CHECK-DAG: %[[FEATURE_GROUP_COUNT_COPY:.*]] = "tf.Const"() +// CHECK-SAME: value = dense<1> : tensor + +// Check that the constants that are input to XlaConvV2's 3rd and 4th +// arguments are not duplicated. +// CHECK-NOT: "tf.Const"() + +// Check that the constants used for tf.XlaConvV2 and tf.AddV2s are different. +// CHECK: %[[GATHER_V2:.*]] = "tf.XlaConvV2"(%[[ARG_0]], %[[ARG_1]], %[[STRIDES_COPY]], %[[PADDING_COPY]], %[[LHS_DILATION_COPY]], %[[RHS_DILATION_COPY]], %[[FEATURE_GROUP_COUNT_COPY]]) + +// CHECK: %[[ADDV2_2:.*]] = "tf.AddV2"(%[[FEATURE_GROUP_COUNT]], %[[FEATURE_GROUP_COUNT]]) +// CHECK: %[[ADDV2_0:.*]] = "tf.AddV2"(%[[LHS_DILATION]], %[[LHS_DILATION]]) +// CHECK: %[[ADDV2_1:.*]] = "tf.AddV2"(%[[RHS_DILATION]], %[[RHS_DILATION]]) + +// ----- + +// CHECK-LABEL: @stop_recursion_when_arg_is_reached +func.func private @stop_recursion_when_arg_is_reached(%arg0: tensor<1x2x3xf32>, %arg1: tensor) -> tensor { +// The pass wants to duplicate constants for TF::MeanOp's operand idx 1, but +// it can't proceed since it is a function argument. 
+ +// expected-warning @+1 {{Operand idx (zero-based): 1 does not have a defining op and cannot be duplicated}} + %0 = "tf.Mean"(%arg0, %arg1) {device = ""} : (tensor<1x2x3xf32>, tensor) -> tensor + + return %0: tensor +} + +// ----- + +// CHECK-LABEL: @constant_with_single_use_not_duplicated +func.func private @constant_with_single_use_not_duplicated(%arg0: tensor<1x2x3xf32>) -> tensor<1x3xf32> { + %cst = "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "", value = dense<1> : tensor} : () -> tensor + %0 = "tf.AddV2"(%cst, %cst_0) {device = ""} : (tensor, tensor) -> tensor + %1 = "tf.Max"(%arg0, %0) {device = ""} : (tensor<1x2x3xf32>, tensor) -> tensor<1x3xf32> + + return %1: tensor<1x3xf32> +} +// CHECK-DAG: %[[CST:.*]] = "tf.Const" +// CHECK-SAME: dense<0> +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const" +// CHECK-SAME: dense<1> +// Check that there are no extra "tf.Const"s existing in this function. +// CHECK-NOT: "tf.Const" + +// Check that the usages of %[[CST]] and %[[CST_0]] are untouched. +// CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[CST]], %[[CST_0]]) +// CHECK: "tf.Max"({{.*}}, %[[ADD]]) + +// ----- + +// CHECK-LABEL: @recursively_duplicate_constants +func.func private @recursively_duplicate_constants(%arg0: tensor<1x2x3xf32>) -> tensor<1x3xf32> { + %cst = "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "", value = dense<1> : tensor} : () -> tensor + %0 = "tf.AddV2"(%cst, %cst_0) {device = ""} : (tensor, tensor) -> tensor + %1 = "tf.Max"(%arg0, %0) {device = ""} : (tensor<1x2x3xf32>, tensor) -> tensor<1x3xf32> + + // Just to introduce extra usages for %cst and %cst_0. + %2 = "tf.Mul"(%cst, %cst_0) {device = ""} : (tensor, tensor) -> tensor + + return %1: tensor<1x3xf32> +} +// Check that both constants are duplicated, which are used to transitively +// determine the shape of the result of `tf.Max`. +// CHECK-DAG: %[[CST:.*]] = "tf.Const" +// CHECK-SAME: dense<0> +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const" +// CHECK-SAME: dense<0> +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const" +// CHECK-SAME: dense<1> +// CHECK-DAG: %[[CST_2:.*]] = "tf.Const" +// CHECK-SAME: dense<1> + +// ----- + +// CHECK-LABEL: @early_stop_at_shape_op +func.func private @early_stop_at_shape_op() -> tensor<1x3xi32> { + %cst = "tf.Const"() {device = "", value = dense<1.0> : tensor<1x3xf32>} : () -> tensor<1x3xf32> + %cst_0 = "tf.Const"() {device = "", value = dense<2> : tensor} : () -> tensor + %1 = "tf.Shape"(%cst) : (tensor<1x3xf32>) -> tensor<2xi32> + // Operand index 0 ($dims) should be a compile-time constant. + %2 = "tf.Fill"(%1, %cst_0) {device = ""} : (tensor<2xi32>, tensor) -> tensor<1x3xi32> + + // Just to introduce extra usages for %cst. + %3 = "tf.Mul"(%cst, %cst) {device = ""} : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + + return %2: tensor<1x3xi32> +} +// The output of tf.Shape is considered a compile-time constant, so the +// constant leading to tf.Shape (which transitively becomes an input to the +// first arg of tf.Fill) is not duplicated. 
+ +// CHECK-DAG: %[[CST:.*]] = "tf.Const" +// CHECK-SAME: dense<1.000000e+00> : tensor<1x3xf32> +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const" +// CHECK-SAME: dense<2> : tensor +// CHECK: %[[SHAPE:.*]] = "tf.Shape"(%[[CST]]) +// CHECK: %[[FILL:.*]] = "tf.Fill"(%[[SHAPE]], %[[CST_0]]) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_custom_aggregation_ops.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_custom_aggregation_ops.mlir new file mode 100644 index 000000000000..a7315c44eb7b --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_custom_aggregation_ops.mlir @@ -0,0 +1,353 @@ +// RUN: tf-quant-opt %s -tf-quant-insert-custom-aggregation-ops='test-case=MIN_MAX' -split-input-file | FileCheck --check-prefix=MIN-MAX-CHECK %s +// RUN: tf-quant-opt %s -tf-quant-insert-custom-aggregation-ops='test-case=AVERAGE_MIN_MAX' -split-input-file | FileCheck --check-prefix=AVERAGE-MIN-MAX-CHECK %s +// RUN: tf-quant-opt %s -tf-quant-insert-custom-aggregation-ops='test-case=HISTOGRAM_PERCENTILE' -split-input-file | FileCheck --check-prefix=HISTOGRAM-PERCENTILE-CHECK %s +// RUN: tf-quant-opt %s -tf-quant-insert-custom-aggregation-ops='test-case=HISTOGRAM_MSE_BRUTEFORCE' -split-input-file | FileCheck --check-prefix=HISTOGRAM-MSE-BRUTEFORCE-CHECK %s +// RUN: tf-quant-opt %s -tf-quant-insert-custom-aggregation-ops='test-case=HISTOGRAM_MSE_MAX_FREQUENCY' -split-input-file | FileCheck --check-prefix=HISTOGRAM-MSE-MAX-FREQUENCY-CHECK %s +// RUN: tf-quant-opt %s -tf-quant-insert-custom-aggregation-ops='test-case=HISTOGRAM_MSE_SYMMETRIC' -split-input-file | FileCheck --check-prefix=HISTOGRAM-MSE-SYMMETRIC-CHECK %s + +module { + func.func @wrap_composite_func(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.PartitionedCall"(%arg0, %arg1) <{f = @composite_conv2d_with_relu6_fn}> {_tfl_quant_trait = "fully_quantizable"} + : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + func.return %0 : tensor<*xf32> + } + + func.func @no_composite_func(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %add = "tf.AddV2"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + func.return %add : tensor<*xf32> + } + + func.func @composite_conv2d_with_relu6_fn(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %1 = "tf.Relu6"(%0) : (tensor<*xf32>) -> tensor<*xf32> + func.return %1 : tensor<*xf32> + } +} + +// CalibrationOptions(calibration_method=CALIBRATION_METHOD_MIN_MAX) +// MIN-MAX-CHECK: func @wrap_composite_func +// MIN-MAX-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{calibration_method = 1 : i32, id = "composite_conv2d_with_relu6_fn_arg_1_calibration_method_1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// MIN-MAX-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "composite_conv2d_with_relu6_fn_arg_0_calibration_method_1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : 
i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// MIN-MAX-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) +// MIN-MAX-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{calibration_method = 1 : i32, id = "composite_conv2d_with_relu6_fn_calibration_method_1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// MIN-MAX-CHECK-NEXT: return [[res]] : tensor<*xf32> + +// MIN-MAX-CHECK: func @no_composite_func +// MIN-MAX-CHECK-NEXT: "tf.AddV2" +// MIN-MAX-CHECK-NEXT: return + +// MIN-MAX-CHECK: func @composite_conv2d_with_relu6_fn +// MIN-MAX-CHECK-NEXT: "tf.Conv2D" +// MIN-MAX-CHECK-NEXT: "tf.Relu6" +// MIN-MAX-CHECK-NEXT: return + +// CalibrationOptions(calibration_method=CALIBRATION_METHOD_AVERAGE_MIN_MAX) +// AVERAGE-MIN-MAX-CHECK: func @wrap_composite_func +// AVERAGE-MIN-MAX-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{calibration_method = 2 : i32, id = "composite_conv2d_with_relu6_fn_arg_1_calibration_method_2", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// AVERAGE-MIN-MAX-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 2 : i32, id = "composite_conv2d_with_relu6_fn_arg_0_calibration_method_2", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// AVERAGE-MIN-MAX-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) +// AVERAGE-MIN-MAX-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{calibration_method = 2 : i32, id = "composite_conv2d_with_relu6_fn_calibration_method_2", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// AVERAGE-MIN-MAX-CHECK-NEXT: return [[res]] : tensor<*xf32> + +// AVERAGE-MIN-MAX-CHECK: func @no_composite_func +// AVERAGE-MIN-MAX-CHECK-NEXT: "tf.AddV2" +// AVERAGE-MIN-MAX-CHECK-NEXT: return + +// AVERAGE-MIN-MAX-CHECK: func @composite_conv2d_with_relu6_fn +// AVERAGE-MIN-MAX-CHECK-NEXT: "tf.Conv2D" +// AVERAGE-MIN-MAX-CHECK-NEXT: "tf.Relu6" +// AVERAGE-MIN-MAX-CHECK-NEXT: return + +// CalibrationOptions( +// calibration_method=CALIBRATION_METHOD_HISTOGRAM_PERCENTILE, +// calibration_parameters=CalibrationParameters(num_bins=256, min_percentile=0.001, max_percentile=99.999) +// ) +// HISTOGRAM-PERCENTILE-CHECK: func @wrap_composite_func +// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{calibration_method = 3 : i32, id = "composite_conv2d_with_relu6_fn_arg_1_calibration_method_3", max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 3 : i32, id = "composite_conv2d_with_relu6_fn_arg_0_calibration_method_3", max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-PERCENTILE-CHECK-NEXT: 
[[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) +// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{calibration_method = 3 : i32, id = "composite_conv2d_with_relu6_fn_calibration_method_3", max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-PERCENTILE-CHECK-NEXT: return [[res]] : tensor<*xf32> + +// HISTOGRAM-PERCENTILE-CHECK: func @no_composite_func +// HISTOGRAM-PERCENTILE-CHECK-NEXT: "tf.AddV2" +// HISTOGRAM-PERCENTILE-CHECK-NEXT: return + +// HISTOGRAM-PERCENTILE-CHECK: func @composite_conv2d_with_relu6_fn +// HISTOGRAM-PERCENTILE-CHECK-NEXT: "tf.Conv2D" +// HISTOGRAM-PERCENTILE-CHECK-NEXT: "tf.Relu6" +// HISTOGRAM-PERCENTILE-CHECK-NEXT: return + +// CalibrationOptions( +// calibration_method=CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE, +// calibration_parameters=CalibrationParameters(num_bins=256) +// ) +// HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @wrap_composite_func +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{calibration_method = 4 : i32, id = "composite_conv2d_with_relu6_fn_arg_1_calibration_method_4", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 4 : i32, id = "composite_conv2d_with_relu6_fn_arg_0_calibration_method_4", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{calibration_method = 4 : i32, id = "composite_conv2d_with_relu6_fn_calibration_method_4", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: return [[res]] : tensor<*xf32> + +// HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @no_composite_func +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: "tf.AddV2" +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: return + +// HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @composite_conv2d_with_relu6_fn +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: "tf.Conv2D" +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: "tf.Relu6" +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: return + +// CalibrationOptions( +// calibration_method=CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY, +// calibration_parameters=CalibrationParameters(num_bins=256) +// ) +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @wrap_composite_func +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{calibration_method = 5 : i32, id = "composite_conv2d_with_relu6_fn_arg_1_calibration_method_5", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 5 : i32, id = 
"composite_conv2d_with_relu6_fn_arg_0_calibration_method_5", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{calibration_method = 5 : i32, id = "composite_conv2d_with_relu6_fn_calibration_method_5", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: return [[res]] : tensor<*xf32> + +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @no_composite_func +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: "tf.AddV2" +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: return + +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @composite_conv2d_with_relu6_fn +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: "tf.Conv2D" +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: "tf.Relu6" +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: return + +// CalibrationOptions( +// calibration_method=CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC, +// calibration_parameters=CalibrationParameters(num_bins=256) +// ) +// HISTOGRAM-MSE-SYMMETRIC-CHECK: func @wrap_composite_func +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{calibration_method = 6 : i32, id = "composite_conv2d_with_relu6_fn_arg_1_calibration_method_6", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 6 : i32, id = "composite_conv2d_with_relu6_fn_arg_0_calibration_method_6", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{calibration_method = 6 : i32, id = "composite_conv2d_with_relu6_fn_calibration_method_6", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: return [[res]] : tensor<*xf32> + +// HISTOGRAM-MSE-SYMMETRIC-CHECK: func @no_composite_func +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: "tf.AddV2" +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: return + +// HISTOGRAM-MSE-SYMMETRIC-CHECK: func @composite_conv2d_with_relu6_fn +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: "tf.Conv2D" +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: "tf.Relu6" +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: return + + +// ----- + +module { + // CHECK-LABEL: func.func @main + func.func @main(%arg0: tensor, %arg1: tensor<100352x10xf32>) -> tensor { + // MIN-MAX-CHECK-DAG: %[[ARG0_ID:.*]] = "tf.Identity"(%arg0) + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[ARG0_ID]]) + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_fn_1_arg_0_calibration_method_1" + // MIN-MAX-CHECK-DAG: %[[ARG1_ID:.*]] = "tf.Identity"(%arg1) + // MIN-MAX-CHECK: %[[ARG1_AGG:.*]], 
{{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[ARG1_ID]]) + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_fn_1_arg_1_calibration_method_1" + // MIN-MAX-CHECK: %[[RES:.*]] = "tf.XlaCallModule"(%[[ARG0_AGG]], %[[ARG1_AGG]]) + // MIN-MAX-CHECK: %[[RES_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[RES]]) + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_fn_1_calibration_method_1" + // MIN-MAX-CHECK: %[[RES_ID:.*]] = "tf.Identity"(%[[RES_AGG]]) + // MIN-MAX-CHECK: return %[[RES_ID]] : tensor + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%arg1) {device = ""} : (tensor<100352x10xf32>) -> tensor<100352x10xf32> + %2 = "tf.XlaCallModule"(%0, %1) <{ + Sout = [#tf_type.shape], dim_args_spec = [], + disabled_checks = [], function_list = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + }> { + _entry_function = @composite_dot_general_fn_1, + _stablehlo_version = "1.0.0", + _original_entry_function = "composite_dot_general_fn_1", + _tfl_quant_trait = "fully_quantizable", + _quantization_method = "static_range_ptq { }" + } : (tensor, tensor<100352x10xf32>) -> tensor + %3 = "tf.Identity"(%2) {device = ""} : (tensor) -> tensor + return %3 : tensor + } + + // CHECK-LABEL: func.func private @composite_dot_general_fn_1 + func.func private @composite_dot_general_fn_1(%arg0: tensor, %arg1: tensor<100352x10xf32>) -> tensor { + // CHECK-NOT: tf.CustomAggregator + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor, tensor<100352x10xf32>) -> tensor + return %0 : tensor + } +} + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1833 : i32}, tf_saved_model.semantics} { + func.func @serving_default(%arg0: tensor<1x4xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi32>}> {device = ""} : () -> tensor<2xi32> + %cst_0 = "tf.Const"() <{value = dense<1.000000e+01> : tensor}> {device = ""} : () -> tensor + %0 = "tf.Sum"(%arg0, %cst) <{keep_dims = false}> {device = ""} : (tensor<1x4xf32>, tensor<2xi32>) -> tensor + %1 = "tf.Greater"(%0, %cst_0) {device = ""} : (tensor, tensor) -> tensor + %2:2 = "tf.If"(%1, %arg0) <{else_branch = @cond_false_80, is_stateless = true, then_branch = @cond_true_70}> {Tcond = i1, Tin = [f32], Tout = [i1, f32], _lower_using_switch_merge = true, _read_only_resource_inputs = [], device = ""} : (tensor, tensor<1x4xf32>) -> (tensor, tensor<1x3xf32>) + %3 = "tf.Identity"(%2#1) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + + func.func private @cond_false_80(%arg0: tensor<1x4xf32> {tf._user_specified_name = "x"}) -> (tensor, tensor<1x3xf32>) attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<1x4>], tf._original_func_name = "cond_false_8"} { + %cst = "tf.Const"() <{value = dense : tensor}> {device = ""} : () -> tensor + %cst_0 = "tf.Const"() <{value = dense<[0.117216609, 0.933735609, 0.0728900209]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() <{value = dense<[[-0.795477629, 0.581315517, 0.921566545], [0.138622552, 0.463866323, 0.95474267], [-0.143770888, -0.796835303, 0.899996876], [0.0989735424, 
-0.483384758, -7.277030e-01]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %0 = "tf.Identity"(%cst) {device = ""} : (tensor) -> tensor + %1 = "tf.PartitionedCall"(%arg0, %cst_1, %cst_0) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %2 = "tf.Identity"(%1) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %0, %2 : tensor, tensor<1x3xf32> + } + // MIN-MAX-CHECK: func.func private @cond_false_80 + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_matmul_with_bias_fn_1_arg_0_calibration_method_1" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_matmul_with_bias_fn_1_calibration_method_1" + + func.func private @cond_true_70(%arg0: tensor<1x4xf32> {tf._user_specified_name = "x"}) -> (tensor, tensor<1x3xf32>) attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<1x4>], tf._original_func_name = "cond_true_7"} { + %cst = "tf.Const"() <{value = dense : tensor}> {device = ""} : () -> tensor + %cst_0 = "tf.Const"() <{value = dense<[0.335351914, 0.084816426, -0.664676845]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() <{value = dense<[[-0.630731344, 0.54962182, 0.180364341], [-0.764542698, -0.211145893, -0.708605706], [-0.954062759, -0.614013135, 0.612640202], [-0.418223292, 5.057390e-01, 0.899269938]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %0 = "tf.Identity"(%cst) {device = ""} : (tensor) -> tensor + %1 = "tf.PartitionedCall"(%arg0, %cst_1, %cst_0) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %2 = "tf.Identity"(%1) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %0, %2 : tensor, tensor<1x3xf32> + } + // MIN-MAX-CHECK: func.func private @cond_true_70 + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_matmul_with_bias_fn_2_arg_0_calibration_method_1" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_matmul_with_bias_fn_2_calibration_method_1" + + func.func private @composite_matmul_with_bias_fn_1(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + + func.func private @composite_matmul_with_bias_fn_2(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> 
{device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1833 : i32}, tf_saved_model.semantics} { + func.func @serving_default(%arg0: tensor<1x4xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() <{value = dense<1.000000e+01> : tensor}> {device = ""} : () -> tensor + %cst_0 = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi32>}> {device = ""} : () -> tensor<2xi32> + %cst_1 = "tf.Const"() <{value = dense<[[-0.630731344, 0.54962182, 0.180364341], [-0.764542698, -0.211145893, -0.708605706], [-0.954062759, -0.614013135, 0.612640202], [-0.418223292, 5.057390e-01, 0.899269938]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %cst_2 = "tf.Const"() <{value = dense<[0.335351914, 0.084816426, -0.664676845]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %cst_3 = "tf.Const"() <{value = dense : tensor}> {device = ""} : () -> tensor + %cst_4 = "tf.Const"() <{value = dense<[[-0.795477629, 0.581315517, 0.921566545], [0.138622552, 0.463866323, 0.95474267], [-0.143770888, -0.796835303, 0.899996876], [0.0989735424, -0.483384758, -7.277030e-01]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %cst_5 = "tf.Const"() <{value = dense<[0.117216609, 0.933735609, 0.0728900209]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %0 = "tf.Sum"(%arg0, %cst_0) <{keep_dims = false}> {device = ""} : (tensor<1x4xf32>, tensor<2xi32>) -> tensor + %1 = "tf.Greater"(%0, %cst) {device = ""} : (tensor, tensor) -> tensor + %2:2 = "tf.IfRegion"(%1) <{_else_func_name = "cond_false_80", _then_func_name = "cond_true_70", is_stateless = true}> ({ + %4 = "tf.Identity"(%cst_3) {device = ""} : (tensor) -> tensor + %5 = "tf.PartitionedCall"(%arg0, %cst_1, %cst_2) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %6 = "tf.Identity"(%5) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + "tf.Yield"(%4, %6) {device = ""} : (tensor, tensor<1x3xf32>) -> () + }, { + %4 = "tf.Identity"(%cst_3) {device = ""} : (tensor) -> tensor + %5 = "tf.PartitionedCall"(%arg0, %cst_4, %cst_5) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %6 = "tf.Identity"(%5) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + "tf.Yield"(%4, %6) {device = ""} : (tensor, tensor<1x3xf32>) -> () + }) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = ""} : (tensor) -> (tensor, tensor<1x3xf32>) + %3 = "tf.Identity"(%2#1) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + // MIN-MAX-CHECK: func.func @serving_default + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_matmul_with_bias_fn_1_arg_0_calibration_method_1" + // MIN-MAX-CHECK: "tf.IfRegion" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = 
"composite_matmul_with_bias_fn_2_calibration_method_1" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_matmul_with_bias_fn_1_calibration_method_1" + + func.func private @composite_matmul_with_bias_fn_2(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + func.func private @composite_matmul_with_bias_fn_1(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1836 : i32}, tf_saved_model.semantics} { + func.func @main(%arg0: tensor<10x1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<10x1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = stablehlo.constant dense<0.000000e+00>: tensor<10x1024x3xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) <{Sout = [#tf_type.shape<10x1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_relu_fn_1, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_with_relu_fn_1", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + return %0 : tensor<10x1x3xf32> + } + // MIN-MAX-CHECK: func.func @main + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_with_relu_fn_1_arg_0_calibration_method_1" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_with_relu_fn_1_calibration_method_1" + + func.func private @composite_dot_general_with_relu_fn_1(%arg0: tensor<10x1x1024xf32>, %arg1: tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %cst = stablehlo.constant dense<0.000000e+00> : tensor<10x1x3xf32> + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0] x [0], contracting_dims = [2] x [1], precision = [DEFAULT, DEFAULT] {mhlo.frontend_attributes = {grad_x = "false", grad_y = "false"}} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + %1 = stablehlo.maximum %0, %cst : tensor<10x1x3xf32> + 
return %1 : tensor<10x1x3xf32> + } +} + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1836 : i32}, tf_saved_model.semantics} { + func.func @main(%arg0: tensor<1x4xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = stablehlo.constant dense<1.000000e+01> : tensor + %cst_0 = stablehlo.constant dense<[[-0.630731344, 0.54962182, 0.180364341], [-0.764542698, -0.211145893, -0.708605706], [-0.954062759, -0.614013135, 0.612640202], [-0.418223292, 5.057390e-01, 0.899269938]]> : tensor<4x3xf32> + %c = stablehlo.constant dense : tensor + %cst_1 = stablehlo.constant dense<[[-0.795477629, 0.581315517, 0.921566545], [0.138622552, 0.463866323, 0.95474267], [-0.143770888, -0.796835303, 0.899996876], [0.0989735424, -0.483384758, -7.277030e-01]]> : tensor<4x3xf32> + %cst_2 = stablehlo.constant dense<-0.000000e+00> : tensor + %cst_3 = stablehlo.constant dense<[[0.335351914, 0.084816426, -0.664676845]]> : tensor<1x3xf32> + %cst_4 = stablehlo.constant dense<[[0.117216609, 0.933735609, 0.0728900209]]> : tensor<1x3xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_2) applies stablehlo.add across dimensions = [0, 1] : (tensor<1x4xf32>, tensor) -> tensor + %1 = stablehlo.compare GT, %0, %cst : (tensor, tensor) -> tensor + %2:2 = "stablehlo.if"(%1) ({ + %3 = "tf.XlaCallModule"(%arg0, %cst_0, %cst_3) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_same_shape_fn_2, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_with_bias_same_shape_fn_2", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + stablehlo.return %c, %3 : tensor, tensor<1x3xf32> + }, { + %3 = "tf.XlaCallModule"(%arg0, %cst_1, %cst_4) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_same_shape_fn_1, _stablehlo_version = "1.0.0", _original_entry_function = "composite_dot_general_with_bias_same_shape_fn_1", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + stablehlo.return %c, %3 : tensor, tensor<1x3xf32> + }) : (tensor) -> (tensor, tensor<1x3xf32>) + return %2#1 : tensor<1x3xf32> + } + // MIN-MAX-CHECK: func.func @main + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_with_bias_same_shape_fn_1_arg_0_calibration_method_1" + // MIN-MAX-CHECK: "stablehlo.if" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_with_bias_same_shape_fn_2_calibration_method_1" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = 
"composite_dot_general_with_bias_same_shape_fn_1_calibration_method_1" + + func.func private @composite_dot_general_with_bias_same_shape_fn_2(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + + func.func private @composite_dot_general_with_bias_same_shape_fn_1(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_main_function.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_main_function.mlir new file mode 100644 index 000000000000..397dddcb1f66 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_main_function.mlir @@ -0,0 +1,214 @@ +// RUN: tf-quant-opt %s -tf-quant-insert-main-function -mlir-disable-threading \ +// RUN: -allow-unregistered-dialect -split-input-file | FileCheck %s + +// CHECK-LABEL: module attributes {tf.versions = {producer = 930 : i32}, tf_saved_model.semantics, tfl.description = "MLIR Converted.", tfl.schema_version = 3 : i32} { +module attributes {tf.versions = {producer = 930 : i32}, tf_saved_model.semantics, tfl.description = "MLIR Converted.", tfl.schema_version = 3 : i32} { + "tf_saved_model.session_initializer"() {initializers = [@NoOp]} : () -> () + func.func @NoOp() attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"]} { + func.return + } +// CHECK: func @NoOp() attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"]} + + func.func @mul1(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["y"]}, %arg1: tensor<1xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1xf32> {tf_saved_model.index_path = ["output_0"]}) attributes {tf.entry_function = {inputs = "mul1_y:0,mul1_x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["mul1"]} { + %0 = "tf.Mul"(%arg1, %arg0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + func.return %0 : tensor<1xf32> + } +// CHECK: func private @mul1(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> attributes {tf.entry_function = {inputs = "mul1_y:0,mul1_x:0", outputs = "PartitionedCall:0"}} +// CHECK: %[[MUL_0:.*]] = "tf.Mul"(%arg1, %arg0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> +// CHECK: return %[[MUL_0]] : tensor<1xf32> +// CHECK: } + + func.func @mul2(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["y"]}, %arg1: tensor<1xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1xf32> {tf_saved_model.index_path = ["output_0"]}) attributes {tf.entry_function = {inputs = "mul2_y:0,mul2_x:0", outputs = "PartitionedCall_1:0"}, tf_saved_model.exported_names = ["mul2"]} { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor + %0 = "tf.Mul"(%arg1, %arg0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %1 = "tf.Mul"(%0, %cst) : (tensor<1xf32>, tensor) -> tensor<1xf32> + func.return %1 : 
tensor<1xf32> + } +// CHECK: func private @mul2(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> attributes {tf.entry_function = {inputs = "mul2_y:0,mul2_x:0", outputs = "PartitionedCall_1:0"}} { +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor}> : () -> tensor +// CHECK: %[[MUL_1:.*]] = "tf.Mul"(%arg1, %arg0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> +// CHECK: %[[MUL_2:.*]] = "tf.Mul"(%[[MUL_1]], %[[CONST_0]]) : (tensor<1xf32>, tensor) -> tensor<1xf32> +// CHECK: return %[[MUL_2]] : tensor<1xf32> +// CHECK: } + +// CHECK: func @main(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["mul1_y:0"]}, %arg1: tensor<1xf32> {tf_saved_model.index_path = ["mul1_x:0"]}, %arg2: tensor<1xf32> {tf_saved_model.index_path = ["mul2_y:0"]}, %arg3: tensor<1xf32> {tf_saved_model.index_path = ["mul2_x:0"]}) -> (tensor<1xf32> {tf_saved_model.index_path = ["PartitionedCall:0"]}, tensor<1xf32> {tf_saved_model.index_path = ["PartitionedCall_1:0"]}) attributes {tf.entry_function = {inputs = "mul1_y:0,mul1_x:0,mul2_y:0,mul2_x:0", outputs = "PartitionedCall:0,PartitionedCall_1:0"}, tf_saved_model.exported_names = ["main"]} { +// CHECK-NOT: f = @NoOp +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1) <{config = "", config_proto = "", executor_type = "", f = @mul1}> : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> +// CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg2, %arg3) <{config = "", config_proto = "", executor_type = "", f = @mul2}> : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> +// CHECK-DAG: %[[IDENTITY_0:.*]] = "tf.Identity"(%[[PARTITIONEDCALL_0]]) +// CHECK-DAG: %[[IDENTITY_1:.*]] = "tf.Identity"(%[[PARTITIONEDCALL_1]]) +// CHECK: return %[[IDENTITY_0]], %[[IDENTITY_1]] : tensor<1xf32>, tensor<1xf32> +// CHECK: } +} + +// ----- + +// Test a case where there is an exported function not labeled tf.entry_function. +// CHECK-LABEL: module attributes {tf.versions = {producer = 1132 : i32}, tf_saved_model.semantics, tfl.description = "MLIR Converted.", tfl.schema_version = 3 : i32} { +module attributes {tf.versions = {producer = 1132 : i32}, tf_saved_model.semantics, tfl.description = "MLIR Converted.", tfl.schema_version = 3 : i32} { + "tf_saved_model.session_initializer"() {initializers = [@NoOp]} : () -> () + "tf_saved_model.asset"() {filename = "assets/mydata.txt", sym_name = "__tf_saved_model_asset0_mydata.txt"} : () -> () +// Session initializer ops and asset ops untouched. +// CHECK: "tf_saved_model.session_initializer"() <{initializers = [@NoOp]}> : () -> () +// CHECK: "tf_saved_model.asset"() <{filename = "assets/mydata.txt", sym_name = "__tf_saved_model_asset0_mydata.txt"}> : () -> () + + func.func @NoOp(%arg0: tensor {tf_saved_model.bound_input = @__tf_saved_model_asset0_mydata.txt}) attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"]} { + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + "tf.InitializeTableFromTextFileV2"(%0, %arg0) {delimiter = "\09", device = "", key_index = -2 : i64, offset = 0 : i64, value_index = -1 : i64, vocab_size = 437 : i64} : (tensor, tensor) -> () + func.return + } +// Initializer function untouched. 
+// CHECK: func.func @NoOp(%[[ARG0:.*]]: tensor {tf_saved_model.bound_input = @__tf_saved_model_asset0_mydata.txt}) +// CHECK-SAME: {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"]} +// CHECK: %[[HASH_TABLE0:.*]] = "tf.HashTableV2"() +// CHECK: "tf.InitializeTableFromTextFileV2"(%[[HASH_TABLE0]], %[[ARG0]]) +// CHECK: return + + func.func @add(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["x"]}, %arg1: tensor<1xf32> {tf_saved_model.index_path = ["y"]}) -> (tensor<1xf32> {tf_saved_model.index_path = ["out_0"]}) attributes {tf.entry_function = {inputs = "add_x:0,add_y:0", outputs = "add:0"}, tf_saved_model.exported_names = ["add"]} { + %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + func.return %0 : tensor<1xf32> + } +// The previously exported function should now be private. +// CHECK: func.func private @add +// CHECK-NOT: tf_saved_model.exported_names +// Other attributes should be left untouched. +// CHECK-SAME: attributes {tf.entry_function = {inputs = "add_x:0,add_y:0", outputs = "add:0"}} + +// Test the newly created "main" function. +// CHECK: func.func @main(%[[ARG0:.*]]: tensor<1xf32> {tf_saved_model.index_path = ["add_x:0"]}, %[[ARG1:.*]]: tensor<1xf32> {tf_saved_model.index_path = ["add_y:0"]}) +// CHECK-SAME: -> (tensor<1xf32> {tf_saved_model.index_path = ["add:0"]}) +// Check attributes of the main function. +// CHECK-SAME: tf.entry_function = {inputs = "add_x:0,add_y:0", outputs = "add:0"} +// CHECK-SAME: tf_saved_model.exported_names = ["main"] + +// Check that the function call to @add exists and not to @NoOp. +// CHECK: %[[CALL0:.*]] = "tf.PartitionedCall"(%[[ARG0]], %[[ARG1]]) <{ +// CHECK-NOT: f = @NoOp +// CHECK-SAME: f = @add +// CHECK-SAME: }> +// CHECK-SAME: : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> +// CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[CALL0]]) +// CHECK: return %[[IDENTITY]] : tensor<1xf32> +} + +// ----- + +// Test a case where an entry function return multiple values +module attributes {tf.versions = {producer = 930 : i32}, tf_saved_model.semantics, tfl.description = "MLIR Converted.", tfl.schema_version = 3 : i32} { + "tf_saved_model.session_initializer"() {initializers = [@NoOp]} : () -> () + func.func @NoOp() attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"]} { + func.return + } + + func.func @topk(%arg0: tensor<16xf32> {tf_saved_model.index_path = ["input"]}, %arg1: tensor {tf_saved_model.index_path = ["k"]}) -> (tensor {tf_saved_model.index_path = ["values"]}, tensor {tf_saved_model.index_path = ["indices"]}) attributes {tf.entry_function = {inputs = "input:0,k:0", outputs = "TopK:0,TopK:1"}, tf_saved_model.exported_names = ["topk"]} { + %0:2 = "tf.TopKV2"(%arg0, %arg1): (tensor<16xf32>, tensor) -> (tensor, tensor) + func.return %0#0, %0#1: tensor, tensor + } + +// CHECK: func.func private @topk(%arg0: tensor<16xf32>, %arg1: tensor) -> (tensor, tensor) +// CHECK-SAME: attributes {tf.entry_function = {inputs = "input:0,k:0", outputs = "TopK:0,TopK:1"}} + +// CHECK: func.func @main(%arg0: tensor<16xf32> {tf_saved_model.index_path = ["input:0"]}, %arg1: tensor {tf_saved_model.index_path = ["k:0"]}) +// CHECK-SAME: -> (tensor {tf_saved_model.index_path = ["TopK:0"]}, tensor {tf_saved_model.index_path = ["TopK:1"]}) +// CHECK-SAME: attributes {tf.entry_function = {inputs = "input:0,k:0", outputs = "TopK:0,TopK:1"}, tf_saved_model.exported_names = ["main"]} +// CHECK: %[[CALL0:.*]]:2 = "tf.PartitionedCall"(%arg0, %arg1) <{config = "", 
config_proto = "", executor_type = "", f = @topk}> +// Expects an IdentityN op to be created. +// CHECK: %[[IDENTITY:.*]]:2 = "tf.IdentityN"(%[[CALL0]]#0, %[[CALL0]]#1) : (tensor, tensor) -> (tensor, tensor) +// CHECK: return %[[IDENTITY]]#0, %[[IDENTITY]]#1 : tensor, tensor +} + +// ----- + +// Test that the signature prefix is added when there are duplicated input names. +module attributes {tf.versions = {producer = 930 : i32}, tf_saved_model.semantics, tfl.description = "MLIR Converted.", tfl.schema_version = 3 : i32} { + "tf_saved_model.session_initializer"() {initializers = [@NoOp]} : () -> () + func.func @NoOp() attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"]} { + func.return + } + + func.func @mul1(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["y"]}, %arg1: tensor<1xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1xf32> {tf_saved_model.index_path = ["output_0"]}) attributes {tf.entry_function = {inputs = "y:0,x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["mul1"]} { + %0 = "tf.Mul"(%arg1, %arg0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + func.return %0 : tensor<1xf32> + } + + func.func @mul2(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["y"]}, %arg1: tensor<1xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1xf32> {tf_saved_model.index_path = ["output_0"]}) attributes {tf.entry_function = {inputs = "y:0,x:0", outputs = "PartitionedCall_1:0"}, tf_saved_model.exported_names = ["mul2"]} { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor + %0 = "tf.Mul"(%arg1, %arg0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %1 = "tf.Mul"(%0, %cst) : (tensor<1xf32>, tensor) -> tensor<1xf32> + func.return %1 : tensor<1xf32> + } + +// CHECK: func @main +// CHECK: (%arg0: tensor<1xf32> {tf_saved_model.index_path = ["mul1_y:0"]}, %arg1: tensor<1xf32> {tf_saved_model.index_path = ["mul1_x:0"]} +// CHECK: %arg2: tensor<1xf32> {tf_saved_model.index_path = ["mul2_y:0"]}, %arg3: tensor<1xf32> {tf_saved_model.index_path = ["mul2_x:0"]}) +// CHECK: -> (tensor<1xf32> {tf_saved_model.index_path = ["PartitionedCall:0"]}, tensor<1xf32> {tf_saved_model.index_path = ["PartitionedCall_1:0"]}) +// CHECK: attributes {tf.entry_function = {inputs = "mul1_y:0,mul1_x:0,mul2_y:0,mul2_x:0", outputs = "PartitionedCall:0,PartitionedCall_1:0"}, tf_saved_model.exported_names = ["main"]} +} + +// ----- + +// Test that the signature prefix is added when there are duplicated output names. 
+module attributes {tf.versions = {producer = 930 : i32}, tf_saved_model.semantics, tfl.description = "MLIR Converted.", tfl.schema_version = 3 : i32} { + "tf_saved_model.session_initializer"() {initializers = [@NoOp]} : () -> () + func.func @NoOp() attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"]} { + func.return + } + + func.func @mul1(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["y"]}, %arg1: tensor<1xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1xf32> {tf_saved_model.index_path = ["output_0"]}) attributes {tf.entry_function = {inputs = "mul1_y:0,mul1_x:0", outputs = "output:0"}, tf_saved_model.exported_names = ["mul1"]} { + %0 = "tf.Mul"(%arg1, %arg0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + func.return %0 : tensor<1xf32> + } + + func.func @mul2(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["y"]}, %arg1: tensor<1xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1xf32> {tf_saved_model.index_path = ["output_0"]}) attributes {tf.entry_function = {inputs = "mul2_y:0,mul2_x:0", outputs = "output:0"}, tf_saved_model.exported_names = ["mul2"]} { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor} : () -> tensor + %0 = "tf.Mul"(%arg1, %arg0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %1 = "tf.Mul"(%0, %cst) : (tensor<1xf32>, tensor) -> tensor<1xf32> + func.return %1 : tensor<1xf32> + } +// CHECK: func @main +// CHECK: (%arg0: tensor<1xf32> {tf_saved_model.index_path = ["mul1_y:0"]}, %arg1: tensor<1xf32> {tf_saved_model.index_path = ["mul1_x:0"]} +// CHECK: %arg2: tensor<1xf32> {tf_saved_model.index_path = ["mul2_y:0"]}, %arg3: tensor<1xf32> {tf_saved_model.index_path = ["mul2_x:0"]}) +// CHECK: -> (tensor<1xf32> {tf_saved_model.index_path = ["mul1_output:0"]}, tensor<1xf32> {tf_saved_model.index_path = ["mul2_output:0"]}) +// CHECK: attributes {tf.entry_function = {inputs = "mul1_y:0,mul1_x:0,mul2_y:0,mul2_x:0", outputs = "mul1_output:0,mul2_output:0"}, tf_saved_model.exported_names = ["main"]} +} + +// ----- + +// Tests when a function called @main already exists, it is renamed to +// `main_{i}` to avoid conflict. +module attributes {tf_saved_model.semantics} { + func.func @main(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["x"]}, %arg1: tensor<1xf32> {tf_saved_model.index_path = ["y"]}) -> (tensor<1xf32> {tf_saved_model.index_path = ["output_0"]}) attributes {tf.entry_function = {inputs = "x:0,y:0", outputs = "output:0"}, tf_saved_model.exported_names = ["main"]} { + %0 = "tf.Mul"(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + func.return %0 : tensor<1xf32> + } + +// CHECK: func.func private @main_0 +// CHECK: func.func @main +} + +// ----- + +// Tests when a function called @main already exists and @main_{i} also already +// exists, it increments the suffix number until there's no conflict. 
+module attributes {tf_saved_model.semantics} { + func.func @main_0(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["z"]}) -> (tensor<1xf32> {tf_saved_model.index_path = ["output_0"]}) attributes {tf.entry_function = {inputs = "z:0", outputs = "output:0"}, tf_saved_model.exported_names = ["main_0"]} { + %0 = "tf.Identity"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> + func.return %0 : tensor<1xf32> + } + + func.func @main(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["x"]}, %arg1: tensor<1xf32> {tf_saved_model.index_path = ["y"]}) -> (tensor<1xf32> {tf_saved_model.index_path = ["output_0"]}) attributes {tf.entry_function = {inputs = "x:0,y:0", outputs = "output:0"}, tf_saved_model.exported_names = ["main"]} { + %0 = "tf.Mul"(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + func.return %0 : tensor<1xf32> + } +// `@main_0` remains untouched. +// CHECK: func.func private @main_0 +// CHECK-SAME: z:0 + +// `@main` should be renamed to `@main_1` instead of `@main_0` to avoid +// conflict. +// CHECK: func.func private @main_1 +// CHECK-SAME: x:0 + +// This is the newly created main function. +// CHECK: func.func @main +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_quantized_functions.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_quantized_functions.mlir new file mode 100644 index 000000000000..b3e01bdfe20b --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_quantized_functions.mlir @@ -0,0 +1,62 @@ +// RUN: tf-quant-opt %s -tf-quant-insert-quantized-functions | FileCheck %s +// RUN: tf-quant-opt %s -tf-quant-insert-quantized-functions='quantization-method=ptq target-opset=UNIFORM_QUANTIZED' --mlir-print-ir-after-all | FileCheck --check-prefix=UQ-CHECK %s + +// Empty module +module { + func.func @simple_fn(%arg0: tensor<*xf32>) -> tensor<*xf32> { + func.return %arg0 : tensor<*xf32> + } +} + +// CHECK-NOT: func private @internal_rescale_fn +// CHECK-NOT: func private @internal_relu_fn +// CHECK-NOT: func private @internal_conv2d_fn +// CHECK-NOT: func private @internal_matmul_fn +// CHECK: func private @quantized_conv2d_with_bias_fn +// CHECK-SAME: tf_quant.quantized_ops = ["Conv2D", "BiasAdd"] +// CHECK: func private @quantized_conv2d_with_bias_and_relu_fn +// CHECK: func private @quantized_conv2d_with_bias_and_relu6_fn +// CHECK: func private @quantized_conv2d_fn +// CHECK: func private @quantized_conv2d_with_relu_fn +// CHECK: func private @quantized_conv2d_with_relu6_fn +// CHECK: func private @quantized_depthwise_conv2d_with_bias_and_relu_float_output_fn +// CHECK-SAME: tf_quant.quantized_ops = ["DepthwiseConv2D", "BiasAdd", "Relu"] +// CHECK: func private @quantized_matmul_with_bias_fn +// CHECK: func private @quantized_matmul_with_bias_and_relu_fn +// CHECK: func private @quantized_matmul_with_bias_and_relu6_fn +// CHECK: func private @quantized_matmul_fn +// CHECK-SAME: tf_quant.quantized_ops = ["MatMul"] +// CHECK: func private @quantized_matmul_with_relu_fn +// CHECK: func private @quantized_matmul_with_relu6_fn +// CHECK: func private @quantized_conv3d_with_bias_fn +// CHECK-SAME: tf_quant.quantized_ops = ["Conv3D", "BiasAdd"] +// CHECK: func private @quantized_batch_matmul_with_bias_fn +// CHECK-SAME: tf_quant.quantized_ops = ["BatchMatMul", "BiasAdd"] +// CHECK: func private @quantize_i8 +// CHECK: func private @dequantize_i8 + +// UQ-CHECK-NOT: func private @internal_conv2d_fn +// UQ-CHECK-NOT: func private @internal_requantize_qi8_fn +// UQ-CHECK-NOT: func private 
@internal_requantize_no_activation_fn +// UQ-CHECK-NOT: func private @internal_requantize_and_relu_fn +// UQ-CHECK: func private @quantized_conv2d_with_bias_fn +// UQ-CHECK-SAME: tf_quant.quantized_ops = ["Conv2D", "BiasAdd"] +// UQ-CHECK: func private @quantized_conv2d_with_bias_and_relu_fn +// UQ-CHECK: func private @quantized_conv2d_with_bias_and_relu6_fn +// UQ-CHECK: func private @quantized_conv2d_with_relu_fn +// UQ-CHECK: func private @quantized_conv2d_with_relu6_fn +// UQ-CHECK: func private @quantized_depthwise_conv2d_with_bias_fn +// UQ-CHECK-SAME: tf_quant.quantized_ops = ["DepthwiseConv2D", "BiasAdd"] +// UQ-CHECK: func private @quantized_depthwise_conv2d_with_bias_and_relu_fn +// UQ-CHECK: func private @quantized_depthwise_conv2d_with_bias_and_relu6_fn +// UQ-CHECK: func private @quantized_depthwise_conv2d_with_relu_fn +// UQ-CHECK: func private @quantized_depthwise_conv2d_with_relu6_fn +// UQ-CHECK: func private @quantized_matmul_with_bias_fn +// UQ-CHECK-SAME: tf_quant.quantized_ops = ["MatMul", "BiasAdd"] +// UQ-CHECK: func private @quantized_matmul_with_bias_and_relu_fn +// UQ-CHECK: func private @quantized_matmul_with_bias_and_relu6_fn +// UQ-CHECK: func private @quantized_matmul_with_relu_fn +// UQ-CHECK: func private @quantized_matmul_with_relu6_fn +// UQ-CHECK: func private @quantize_i8 +// UQ-CHECK: func private @quantize_i32 +// UQ-CHECK: func private @dequantize_i8 diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_restore_op.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_restore_op.mlir new file mode 100644 index 000000000000..6723026aad7f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_restore_op.mlir @@ -0,0 +1,192 @@ +// RUN: tf-quant-opt %s -split-input-file -verify-diagnostics \ +// RUN: -tf-quant-insert-restore-op | FileCheck %s +// RUN: tf-quant-opt %s -split-input-file -verify-diagnostics \ +// RUN: -mlir-print-debuginfo -mlir-print-local-scope \ +// RUN: -tf-quant-insert-restore-op | FileCheck %s --check-prefix CHECK-LOC + +// RestoreV2 op created for a single VarHandleOp. + +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op]} : () -> () + + func.func @init_func_restore_op() -> () attributes { + tf_saved_model.initializer_type = "restore_op", + tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"]} { + %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %var_0 = "tf.VarHandleOp"() {shared_name = "var_0"} : () -> tensor>> + "tf.AssignVariableOp"(%var_0, %cst_0) : (tensor>>, tensor<2xf32>) -> () + return + } + +// CHECK: func.func @init_func_restore_op +// Check that an argument ("__tf_file_prefix") is created. +// CHECK-SAME: %[[ARG_0:.*]]: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]} + +// Original `AssignVariableOp(VarHandleOp, Const)` pattern persists. +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {{.*value = dense<1.000000e\+00> : tensor<2xf32>.*}} +// CHECK-DAG: %[[VAR_HANDLE:.*]] = "tf.VarHandleOp"() {{.*shared_name = "var_0".*}} : () -> tensor>> +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[CST_0]]) : (tensor>>, tensor<2xf32>) -> () + +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {{.*value = dense<"var_0"> : tensor<1x!tf_type.string>.*}} +// CHECK-DAG: %[[CST_2:.*]] = "tf.Const"() {{.*value = dense<""> : tensor<1x!tf_type.string>.*}} + +// Test that RestoreV2 op is created with 1 resulting value. 
+// CHECK: %[[RESTORE:.*]] = "tf.RestoreV2"(%[[ARG_0]], %[[CST_1]], %[[CST_2]]) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<2xf32> +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[RESTORE]]) <{validate_shape = false}> : (tensor>>, tensor<2xf32>) -> () + +// Test that the loc is properly set to it's shared_name. +// CHECK-LOC: "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> +// CHECK-LOC-SAME: loc("var_0") +} + +// ----- + +// RestoreV2 op created for multiple VarHandleOps. + +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op_multiple_variables]} : () -> () + + func.func @init_func_restore_op_multiple_variables() -> () attributes { + tf_saved_model.initializer_type = "restore_op", + tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"]} { + %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %var_0 = "tf.VarHandleOp"() {shared_name = "var_0"} : () -> tensor>> + "tf.AssignVariableOp"(%var_0, %cst_0) : (tensor>>, tensor<2xf32>) -> () + + %cst_1 = "tf.Const"() {value = dense<2> : tensor<4xi32>} : () -> tensor<4xi32> + %var_1 = "tf.VarHandleOp"() {shared_name = "var_1"} : () -> tensor>> + "tf.AssignVariableOp"(%var_1, %cst_1) : (tensor>>, tensor<4xi32>) -> () + return + } + +// CHECK: func.func @init_func_restore_op_multiple_variables +// Check that an argument ("__tf_file_prefix") is created. +// CHECK-SAME: %[[ARG_0:.*]]: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]} + +// CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() {{.*shared_name = "var_0".*}} : () -> tensor>> +// CHECK-DAG: %[[VAR_HANDLE_1:.*]] = "tf.VarHandleOp"() {{.*shared_name = "var_1".*}} : () -> tensor>> + +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{{{.*value = dense<\["var_0", "var_1"\]> : tensor<2x!tf_type.string>.*}}}> +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{{{.*value = dense<""> : tensor<2x!tf_type.string>.*}}}> + +// Test that RestoreV2 op is created with 2 resulting values. +// CHECK: %[[RESTORE:.*]]:2 = "tf.RestoreV2"(%[[ARG_0]], %[[CST_0]], %[[CST_1]]) : (tensor, tensor<2x!tf_type.string>, tensor<2x!tf_type.string>) -> (tensor<2xf32>, tensor<4xi32>) + +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[RESTORE]]#0) <{validate_shape = false}> : (tensor>>, tensor<2xf32>) -> () +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE_1]], %[[RESTORE]]#1) <{validate_shape = false}> : (tensor>>, tensor<4xi32>) -> () + +// Test that the locs are properly set to their shared_names. +// CHECK-LOC: "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> +// CHECK-LOC-SAME: loc("var_0") +// CHECK-LOC: "tf.VarHandleOp"() <{{{.*shared_name = "var_1".*}}}> +// CHECK-LOC-SAME: loc("var_1") +} + +// ----- + +// RestoreV2 op not created for `AssignVariableOp(VarHandleOp, Const)` patterns +// in the initializer function of "init_op" type. 
+ +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_init_op]} : () -> () + + func.func @init_func_init_op() -> () attributes { + tf_saved_model.initializer_type = "init_op", + tf_saved_model.exported_names = ["tf_saved_model.session_initializer_init_op"]} { + %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %var_0 = "tf.VarHandleOp"() {shared_name = "var_0"} : () -> tensor>> + "tf.AssignVariableOp"(%var_0, %cst_0) {validate_shape = false} : (tensor>>, tensor<2xf32>) -> () + return + } +// Check that no function argument is created. +// CHECK: func.func @init_func_init_op() + +// CHECK-DAG: %[[VAR_HANDLE:.*]] = "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> : () -> tensor>> +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{{{.*value = dense<1.000000e\+00> : tensor<2xf32>.*}}}> +// Make sure that "tf.RestoreV2" is not created. +// CHECK-NOT: "tf.RestoreV2" +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[CST]]) <{validate_shape = false}> : (tensor>>, tensor<2xf32>) -> () + +// CHECK-LOC: @init_func_init_op +// CHECK-LOC: return +} + +// ----- + +// Test that `RestoreV2Op` is created even when the `Const` op is shared across +// `AssignVariableOp`s. + +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op_multiple_variables_sharing_const]} : () -> () + + func.func @init_func_restore_op_multiple_variables_sharing_const() -> () attributes { + tf_saved_model.initializer_type = "restore_op", + tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"]} { + // This const is shared and initializes two variables. + %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + + %var_0 = "tf.VarHandleOp"() {shared_name = "var_0"} : () -> tensor>> + "tf.AssignVariableOp"(%var_0, %cst_0) : (tensor>>, tensor<2xf32>) -> () + + %var_1 = "tf.VarHandleOp"() {shared_name = "var_1"} : () -> tensor>> + "tf.AssignVariableOp"(%var_1, %cst_0) : (tensor>>, tensor<2xf32>) -> () + return + } + +// CHECK: func.func @init_func_restore_op_multiple_variables_sharing_const +// Check that an argument ("__tf_file_prefix") is created. +// CHECK-SAME: %[[ARG_0:.*]]: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]} + +// CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() {{.*shared_name = "var_0".*}} : () -> tensor>> +// CHECK-DAG: %[[VAR_HANDLE_1:.*]] = "tf.VarHandleOp"() {{.*shared_name = "var_1".*}} : () -> tensor>> + +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{{{.*value = dense<\["var_0", "var_1"\]> : tensor<2x!tf_type.string>.*}}}> +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{{{.*value = dense<""> : tensor<2x!tf_type.string>.*}}}> + +// Test that RestoreV2 op is created with 2 resulting values. +// CHECK: %[[RESTORE:.*]]:2 = "tf.RestoreV2"(%[[ARG_0]], %[[CST_0]], %[[CST_1]]) : (tensor, tensor<2x!tf_type.string>, tensor<2x!tf_type.string>) -> (tensor<2xf32>, tensor<2xf32>) + +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[RESTORE]]#0) <{validate_shape = false}> : (tensor>>, tensor<2xf32>) -> () +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE_1]], %[[RESTORE]]#1) <{validate_shape = false}> : (tensor>>, tensor<2xf32>) -> () + +// Test that the locs are properly set to their shared_names. 
+// CHECK-LOC: "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> +// CHECK-LOC-SAME: loc("var_0") +// CHECK-LOC: "tf.VarHandleOp"() <{{{.*shared_name = "var_1".*}}}> +// CHECK-LOC-SAME: loc("var_1") +} + + +// ----- + +// Test that "tf.RestoreV2" is not created because there are no variables. + +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op_no_variable]} : () -> () + + func.func @init_func_restore_op_no_variable() -> () attributes { + tf_saved_model.initializer_type = "restore_op", + tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"]} { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + return + } +// CHECK: func.func @init_func_restore_op_no_variable() +// CHECK-NOT: "tf.RestoreV2" +} + +// ----- + +// Test when there are no initializers. + +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = []} : () -> () +// CHECK-NOT: "tf.RestoreV2" +} + +// ----- + +// Test when there is no SessionInitializerOp. + +module attributes {tf_saved_model.semantics} { +// CHECK-NOT: "tf.RestoreV2" +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_save_op.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_save_op.mlir new file mode 100644 index 000000000000..d8dacbab31a7 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_insert_save_op.mlir @@ -0,0 +1,116 @@ +// RUN: tf-quant-opt %s -split-input-file -verify-diagnostics \ +// RUN: -tf-quant-insert-save-op | FileCheck %s + +// SaveV2 op created for a single VarHandleOp. + +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op]} : () -> () +// SessionInitializerOp is untouched. +// CHECK: "tf_saved_model.session_initializer"() +// CHECK-SAME: {{.*initializers = \[@init_func_restore_op\].*}} + + func.func @init_func_restore_op() -> () attributes { + tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"], + tf_saved_model.initializer_type = "restore_op"} { + %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %var_0 = "tf.VarHandleOp"() {shared_name = "var_0"} : () -> tensor>> + "tf.AssignVariableOp"(%var_0, %cst_0) : (tensor>>, tensor<2xf32>) -> () + return + } +// Initializer function is untouched. +// CHECK: func.func @init_func_restore_op +// CHECK-SAME: tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"] +// CHECK-SAME: tf_saved_model.initializer_type = "restore_op" +// CHECK-DAG: %[[CST:.*]] = "tf.Const" +// CHECK-DAG: %[[VAR_HANDLE:.*]] = "tf.VarHandleOp" +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[CST]]) + +// Test that a new save function that wraps the SaveV2 op is created. 
+// CHECK: func.func private @tf_quant__save(%[[ARG:.*]]: tensor) +// CHECK: %[[VAR_HANDLE:.*]] = "tf.VarHandleOp"() +// CHECK-SAME: {{.*shared_name = "var_0".*}} +// CHECK: %[[READ_VARIABLE:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE]]) : (tensor>>) -> tensor<2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{{{.*value = dense<"var_0"> : tensor<1x!tf_type.string>.*}}}> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{{{.*value = dense<""> : tensor<1x!tf_type.string>.*}}}> +// CHECK: "tf.SaveV2"(%[[ARG]], %[[CONST_0]], %[[CONST_1]], %[[READ_VARIABLE]]) +// CHECK: return +} + +// ----- + +// SaveV2 op created for multiple VarHandleOps. + +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op]} : () -> () +// SessionInitializerOp is untouched. +// CHECK: "tf_saved_model.session_initializer"() +// CHECK-SAME: {{.*initializers = \[@init_func_restore_op\].*}} + + func.func @init_func_restore_op() -> () attributes { + tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"], + tf_saved_model.initializer_type = "restore_op"} { + %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %var_0 = "tf.VarHandleOp"() {shared_name = "var_0"} : () -> tensor>> + "tf.AssignVariableOp"(%var_0, %cst_0) : (tensor>>, tensor<2xf32>) -> () + %cst_1 = "tf.Const"() {value = dense<2.000000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %var_1 = "tf.VarHandleOp"() {shared_name = "var_1"} : () -> tensor>> + "tf.AssignVariableOp"(%var_1, %cst_1) : (tensor>>, tensor<3xf32>) -> () + return + } +// Initializer function is untouched. +// CHECK: func.func @init_func_restore_op +// CHECK-SAME: tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"] +// CHECK-SAME: tf_saved_model.initializer_type = "restore_op" + +// Test that a new save function that wraps the SaveV2 op is created. +// CHECK: func.func private @tf_quant__save(%[[ARG:.*]]: tensor) +// CHECK: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() +// CHECK-SAME: {{.*shared_name = "var_0".*}} +// CHECK: %[[VAR_HANDLE_1:.*]] = "tf.VarHandleOp"() +// CHECK-SAME: {{.*shared_name = "var_1".*}} + +// ReadVariableOps are inserted for each VarHandleOp to read the tensor values. +// CHECK-DAG: %[[READ_VARIABLE_0:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE_0]]) : (tensor>>) -> tensor<2xf32> +// CHECK-DAG: %[[READ_VARIABLE_1:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE_1]]) : (tensor>>) -> tensor<3xf32> + +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{{{.*value = dense<\["var_0", "var_1"\]> : tensor<2x!tf_type.string>.*}}}> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{{{.*value = dense<""> : tensor<2x!tf_type.string>.*}}}> +// CHECK: "tf.SaveV2"(%[[ARG]], %[[CONST_0]], %[[CONST_1]], %[[READ_VARIABLE_0]], %[[READ_VARIABLE_1]]) +// CHECK: return +} + +// ----- + + +// SaveV2 op not created when SessionInitializerOp doesn't exist. + +module attributes {tf_saved_model.semantics} { +// CHECK-NOT: @tf_quant__save +} + +// ----- + +// SaveV2 op not created when there are no VarHandleOp in the session +// initializer function. + +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op]} : () -> () + func.func @init_func_restore_op() -> () attributes { + tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"], + tf_saved_model.initializer_type = "restore_op"} { + return + } +// Test that the function for SaveV2 op is not created. 
+// CHECK: func.func @init_func_restore_op +// CHECK-NOT: @tf_quant__save +} + +// ----- + +// SaveV2 op not created when the initializer function doesn't exist. + +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = []} : () -> () +// Test that the function for SaveV2 op is not created. +// CHECK-NOT: @tf_quant__save +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_lift_hashtable_ops_as_args.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_lift_hashtable_ops_as_args.mlir new file mode 100644 index 000000000000..88fd3d9f880b --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_lift_hashtable_ops_as_args.mlir @@ -0,0 +1,167 @@ +// RUN: tf-quant-opt %s -split-input-file -tf-quant-lift-hashtable-ops-as-args | FileCheck %s +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1506 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_all_tables]} : () -> () + func.func @init_all_tables() attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_init_all_tables"], tf_saved_model.initializer_type = "init_op"} { + %cst = "tf.Const"() {value = dense<["hello", "model", "quantization"]> : tensor<3x!tf_type.string>} : () -> tensor<3x!tf_type.string> + %cst_0 = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi64>} : () -> tensor<3xi64> + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_ce3dfbfc-7367-4d62-9d48-d13bf8125391", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + "tf.LookupTableImportV2"(%0, %cst, %cst_0) {_has_manual_control_dependencies = true, device = ""} : (tensor, tensor<3x!tf_type.string>, tensor<3xi64>) -> () + return + } + +// Check that HashTable op in the initilizer is not lifted. +// CHECK: func.func @init_all_tables() +// CHECK: %[[OUT_0:.*]] = "tf.HashTableV2"() +// CHECK: "tf.LookupTableImportV2"(%[[OUT_0]] + func.func private @serving_default(%arg0: tensor ) -> (tensor<*xi64>) attributes {tf.entry_function = {control_outputs = "", inputs = "input_vocabs:0", outputs = "FakeQuantWithMinMaxArgs_2:0"}} { + %cst = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> + %cst_1 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<0.00235294132> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {value = dense<0.00117647066> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {value = dense<-43> : tensor} : () -> tensor + %cst_6 = "tf.Const"() {value = dense<0.00156862743> : tensor} : () -> tensor + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_ce3dfbfc-7367-4d62-9d48-d13bf8125391", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %1 = "tf.LookupTableSizeV2"(%0) {device = ""} : (tensor) -> tensor + %2 = "tf.Shape"(%arg0) {device = ""} : (tensor) -> tensor<1xi32> + %3 = "tf.StringToHashBucketFast"(%arg0) {device = "", num_buckets = 5 : i64} : (tensor) -> tensor + %4 = "tf.AddV2"(%3, %1) {device = ""} : (tensor, tensor) -> tensor + %5 = "tf.LookupTableFindV2"(%0, %arg0, %cst) {device = ""} : (tensor, tensor, tensor) -> tensor<*xi64> + return %5 : tensor<*xi64> + } + +// Check that HashTable op is lifted. 
+// CHECK: func.func private @serving_default +// CHECK-SAME: (%arg0: tensor, %arg1: tensor) -> tensor<*xi64> +// CHECK-SAME: tf.entry_function = {control_outputs = "", inputs = "input_vocabs:0,hash_table_1:0", outputs = "FakeQuantWithMinMaxArgs_2:0"} +// CHECK: "tf.LookupTableSizeV2"(%arg1) +// CHECK: "tf.LookupTableFindV2"(%arg1 + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["input_vocabs:0"]} ) -> (tensor<*xi64> {tf_saved_model.index_path = ["FakeQuantWithMinMaxArgs_2:0"]}) attributes {tf.entry_function = {inputs = "input_vocabs:0", outputs = "FakeQuantWithMinMaxArgs_2:0"}, tf_saved_model.exported_names = ["main"]} { + %0 = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @serving_default} : (tensor) -> (tensor<*xi64>) + %1 = "tf.Identity"(%0) : (tensor<*xi64>) -> tensor<*xi64> + return %1 : tensor<*xi64> + } + +// Check that the caller is updated. +// CHECK: func.func @main +// CHECK: %[[OUT_1:.*]] = "tf.HashTableV2"() +// CHECK: %[[OUT_2:.*]] = "tf.PartitionedCall"(%arg0, %[[OUT_1]]) +} +// ----- +// Test nested function case. +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1506 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_all_tables]} : () -> () + func.func @init_all_tables() attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_init_all_tables"], tf_saved_model.initializer_type = "init_op"} { + %cst = "tf.Const"() {value = dense<["hello", "model", "quantization"]> : tensor<3x!tf_type.string>} : () -> tensor<3x!tf_type.string> + %cst_0 = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi64>} : () -> tensor<3xi64> + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_ce3dfbfc-7367-4d62-9d48-d13bf8125391", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + "tf.LookupTableImportV2"(%0, %cst, %cst_0) {_has_manual_control_dependencies = true, device = ""} : (tensor, tensor<3x!tf_type.string>, tensor<3xi64>) -> () + return + } + +// Check that HashTable op in the initilizer is not lifted. +// CHECK: func.func @init_all_tables() +// CHECK: %[[OUT_0:.*]] = "tf.HashTableV2"() +// CHECK: "tf.LookupTableImportV2"(%[[OUT_0]] + func.func private @serving_default(%arg0: tensor ) -> (tensor<*xi64>) attributes {tf.entry_function = {control_outputs = "", inputs = "input_vocabs:0", outputs = "FakeQuantWithMinMaxArgs_2:0"}} { + %0 = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @serving_default1} : (tensor) -> (tensor<*xi64>) + %1 = "tf.Identity"(%0) : (tensor<*xi64>) -> tensor<*xi64> + return %1 : tensor<*xi64> + } +// Check that HashTable op is passed through. 
+// CHECK: func.func private @serving_default +// CHECK-SAME: (%arg0: tensor, %arg1: tensor) -> tensor<*xi64> +// CHECK-SAME: tf.entry_function = {control_outputs = "", inputs = "input_vocabs:0,hash_table_1:0", outputs = "FakeQuantWithMinMaxArgs_2:0"} +// CHECK: "tf.PartitionedCall"(%arg0, %arg1) + func.func private @serving_default1(%arg0: tensor ) -> (tensor<*xi64>) { + %cst = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> + %cst_1 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<0.00235294132> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {value = dense<0.00117647066> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {value = dense<-43> : tensor} : () -> tensor + %cst_6 = "tf.Const"() {value = dense<0.00156862743> : tensor} : () -> tensor + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_ce3dfbfc-7367-4d62-9d48-d13bf8125391", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %1 = "tf.LookupTableSizeV2"(%0) {device = ""} : (tensor) -> tensor + %2 = "tf.Shape"(%arg0) {device = ""} : (tensor) -> tensor<1xi32> + %3 = "tf.StringToHashBucketFast"(%arg0) {device = "", num_buckets = 5 : i64} : (tensor) -> tensor + %4 = "tf.AddV2"(%3, %1) {device = ""} : (tensor, tensor) -> tensor + %5 = "tf.LookupTableFindV2"(%0, %arg0, %cst) {device = ""} : (tensor, tensor, tensor) -> tensor<*xi64> + return %5 : tensor<*xi64> + } + +// Check that HashTable op is lifted. +// CHECK: func.func private @serving_default1 +// CHECK-SAME: (%arg0: tensor, %arg1: tensor) -> tensor<*xi64> +// CHECK: "tf.LookupTableSizeV2"(%arg1) +// CHECK: "tf.LookupTableFindV2"(%arg1 + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["input_vocabs:0"]} ) -> (tensor<*xi64> {tf_saved_model.index_path = ["FakeQuantWithMinMaxArgs_2:0"]}) attributes {tf.entry_function = {inputs = "input_vocabs:0", outputs = "FakeQuantWithMinMaxArgs_2:0"}, tf_saved_model.exported_names = ["main"]} { + %0 = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @serving_default} : (tensor) -> (tensor<*xi64>) + %1 = "tf.Identity"(%0) : (tensor<*xi64>) -> tensor<*xi64> + return %1 : tensor<*xi64> + } +// Check that the caller is updated. +// CHECK: func.func @main +// CHECK: %[[OUT_1:.*]] = "tf.HashTableV2"() +// CHECK: %[[OUT_2:.*]] = "tf.PartitionedCall"(%arg0, %[[OUT_1]]) +} + +// ----- + +// Test multiple HashTable ops. 
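+// Each HashTableV2 op in the function body is expected to be lifted and
+// passed in as a separate argument: the CHECK lines below expect
+// @serving_default to gain two new arguments and its `tf.entry_function`
+// inputs to list both `hash_table_1:0` and `hash_table_2:0`, while @main
+// creates the two tables and forwards them through the PartitionedCall.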
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1506 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_all_tables]} : () -> () + func.func @init_all_tables() attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_init_all_tables"], tf_saved_model.initializer_type = "init_op"} { + %cst = "tf.Const"() {value = dense<["hello", "model", "quantization"]> : tensor<3x!tf_type.string>} : () -> tensor<3x!tf_type.string> + %cst_0 = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi64>} : () -> tensor<3xi64> + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_0", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + "tf.LookupTableImportV2"(%0, %cst, %cst_0) {_has_manual_control_dependencies = true, device = ""} : (tensor, tensor<3x!tf_type.string>, tensor<3xi64>) -> () + %1 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_1", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + "tf.LookupTableImportV2"(%1, %cst, %cst_0) {_has_manual_control_dependencies = true, device = ""} : (tensor, tensor<3x!tf_type.string>, tensor<3xi64>) -> () + return + } +// Check that HashTable op in the initilizer is not lifted. +// CHECK: func.func @init_all_tables() +// CHECK: %[[OUT_0:.*]] = "tf.HashTableV2"() +// CHECK: "tf.LookupTableImportV2"(%[[OUT_0]] + + func.func private @serving_default(%arg0: tensor ) -> (tensor<*xi64>) attributes {tf.entry_function = {control_outputs = "", inputs = "input_vocabs:0", outputs = "FakeQuantWithMinMaxArgs_2:0"}} { + %cst = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> + %cst_1 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<0.00235294132> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {value = dense<0.00117647066> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {value = dense<-43> : tensor} : () -> tensor + %cst_6 = "tf.Const"() {value = dense<0.00156862743> : tensor} : () -> tensor + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_1", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %1 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_0", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %2 = "tf.LookupTableSizeV2"(%0) {device = ""} : (tensor) -> tensor + %3 = "tf.LookupTableSizeV2"(%1) {device = ""} : (tensor) -> tensor + %4 = "tf.AddV2"(%2, %3) {device = ""} : (tensor, tensor) -> tensor + %5 = "tf.LookupTableFindV2"(%0, %arg0, %cst) {device = ""} : (tensor, tensor, tensor) -> tensor<*xi64> + %6 = "tf.AddV2"(%5, %4) {device = ""} : (tensor<*xi64>, tensor) -> tensor<*xi64> + return %6 : tensor<*xi64> + } +// Check that HashTable op is lifted. 
+// CHECK: func.func private @serving_default +// CHECK-SAME: (%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<*xi64> +// CHECK-SAME: tf.entry_function = {control_outputs = "", inputs = "input_vocabs:0,hash_table_1:0,hash_table_2:0", outputs = "FakeQuantWithMinMaxArgs_2:0"} +// CHECK: "tf.LookupTableSizeV2"(%arg1) +// CHECK: "tf.LookupTableSizeV2"(%arg2) +// CHECK: "tf.LookupTableFindV2"(%arg1 + + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["input_vocabs:0"]} ) -> (tensor<*xi64> {tf_saved_model.index_path = ["FakeQuantWithMinMaxArgs_2:0"]}) attributes {tf.entry_function = {inputs = "input_vocabs:0", outputs = "FakeQuantWithMinMaxArgs_2:0"}, tf_saved_model.exported_names = ["main"]} { + %0 = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @serving_default} : (tensor) -> (tensor<*xi64>) + %1 = "tf.Identity"(%0) : (tensor<*xi64>) -> tensor<*xi64> + return %1 : tensor<*xi64> + } + +// Check that the caller is updated. +// CHECK: func.func @main +// CHECK: %[[HASHTABLE_1:.*]] = "tf.HashTableV2"() +// CHECK: %[[HASHTABLE_2:.*]] = "tf.HashTableV2"() +// CHECK: %[[OUT_2:.*]] = "tf.PartitionedCall"(%arg0, %[[HASHTABLE_1]], %[[HASHTABLE_2]]) +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_lift_quantizable_spots_as_functions.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_lift_quantizable_spots_as_functions.mlir new file mode 100644 index 000000000000..a0c4086e04cc --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_lift_quantizable_spots_as_functions.mlir @@ -0,0 +1,508 @@ +// RUN: tf-quant-opt %s -split-input-file -tf-quant-lift-quantizable-spots-as-functions | FileCheck %s + +// CHECK-LABEL: float_conv +func.func @float_conv(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %3 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %4 = "tf.BiasAdd"(%3, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %5 = "tf.Relu"(%4) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %6 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %7 = "tf.BiasAdd"(%6, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + func.return %2, %5, %7 : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> + +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) +// CHECK-SAME: f = 
@composite_conv2d_with_bias_and_relu6_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" +// CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) +// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu_fn_1} +// CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) +// CHECK-SAME: f = @composite_conv2d_with_bias_fn_1} +// CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] +// CHECK: } + +// CHECK-LABEL: private @composite_conv2d_with_bias_and_relu6_fn_1 +// CHECK-NEXT: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %arg1) +// CHECK-SAME: data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true +// CHECK-SAME: attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations" +// CHECK-NEXT: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[CONV2D_0]], %arg2) +// CHECK-NEXT: %[[RELU6_0:.*]] = "tf.Relu6"(%[[BIASADD_0]]) +// CHECK-NEXT: return %[[RELU6_0]] + +// CHECK-LABEL: private @composite_conv2d_with_bias_and_relu_fn_1 +// CHECK-NEXT: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %arg1) +// CHECK-SAME: attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations" +// CHECK-NEXT: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[CONV2D_0]], %arg2) +// CHECK-NEXT: %[[RELU6_0:.*]] = "tf.Relu"(%[[BIASADD_0]]) +// CHECK-NEXT: return %[[RELU6_0]] + +// CHECK-LABEL: private @composite_conv2d_with_bias_fn_1 +// CHECK-NEXT: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %arg1) +// CHECK-SAME: attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations" +// CHECK-NEXT: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[CONV2D_0]], %arg2) +// CHECK-NEXT: return %[[BIASADD_0]] +} + +// ----- + +func.func @float_conv_strides_equals_to_dilations(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<*xf32> { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %arg1) {data_format = "NHWC", device = "", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + func.return %2 : tensor<*xf32> +} + +// CHECK-LABEL: func @float_conv_strides_equals_to_dilations(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<*xf32> { +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<2xf32>) -> tensor<*xf32> +// CHECK: return %[[PARTITIONEDCALL_0]] : tensor<*xf32> +// CHECK: } + +// CHECK-LABEL: func private @composite_conv2d_with_bias_and_relu6_fn_1(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<2xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { +// CHECK-NEXT: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %arg1) +// CHECK-SAME: data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true +// CHECK-SAME: 
attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations" +// CHECK-NEXT: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[CONV2D_0]], %arg2) +// CHECK-NEXT: %[[RELU6_0:.*]] = "tf.Relu6"(%[[BIASADD_0]]) +// CHECK-NEXT: return %[[RELU6_0]] + +// ----- + +// CHECK-LABEL: float_depthwise_conv +func.func @float_depthwise_conv(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x1xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1] + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %3 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1] + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xf32>) -> tensor<*xf32> + %4 = "tf.BiasAdd"(%3, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %5 = "tf.Relu"(%4) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %6 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1] + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xf32>) -> tensor<*xf32> + %7 = "tf.BiasAdd"(%6, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + func.return %2, %5, %7 : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> + +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) +// CHECK-SAME: f = @composite_depthwise_conv2d_with_bias_and_relu6_fn_1}> +// CHECK-SAME: _tfl_quant_trait = "fully_quantizable" +// CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) +// CHECK-SAME: f = @composite_depthwise_conv2d_with_bias_and_relu_fn_1 +// CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) +// CHECK-SAME: f = @composite_depthwise_conv2d_with_bias_fn_1 +// CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] +// CHECK: } + +// CHECK-LABEL: private @composite_depthwise_conv2d_with_bias_and_relu6_fn_1 +// CHECK-NEXT: %[[DEPTHWISECONV2D_0:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %arg1) +// CHECK-SAME: attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations" +// CHECK-NEXT: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[DEPTHWISECONV2D_0]], %arg2) +// CHECK-NEXT: %[[RELU6_0:.*]] = "tf.Relu6"(%[[BIASADD_0]]) +// CHECK-NEXT: return %[[RELU6_0:.*]] + +// CHECK-LABEL: private @composite_depthwise_conv2d_with_bias_and_relu_fn_1 +// CHECK-NEXT: %[[DEPTHWISECONV2D_0:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %arg1) +// CHECK-SAME: attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations" +// CHECK-NEXT: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[DEPTHWISECONV2D_0]], %arg2) +// CHECK-NEXT: %[[RELU_0:.*]] = "tf.Relu"(%[[BIASADD_0]]) +// CHECK-NEXT: return %[[RELU_0:.*]] +} + +// ----- + +// CHECK-LABEL: float_matmul +func.func @float_matmul( + %arg0: tensor<1x10xf32>, %arg1: 
tensor<10x10xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<10xf32>} : () -> tensor<10xf32> + %0 = "tf.MatMul"(%arg0, %arg1) { + transpose_a = false, transpose_b = false + } : (tensor<1x10xf32>, tensor<10x10xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<10xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + %3 = "tf.MatMul"(%arg0, %arg1) { + transpose_a = true, transpose_b = false + } : (tensor<1x10xf32>, tensor<10x10xf32>) -> tensor<*xf32> + %4 = "tf.BiasAdd"(%3, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<10xf32>) -> tensor<*xf32> + %5 = "tf.Relu"(%4) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + %6 = "tf.MatMul"(%arg0, %arg1) { + transpose_a = false, transpose_b = true + } : (tensor<1x10xf32>, tensor<10x10xf32>) -> tensor<*xf32> + %7 = "tf.BiasAdd"(%6, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<10xf32>) -> tensor<*xf32> + func.return %2, %5, %7 : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> + +// CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<10xf32>}> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) +// CHECK-SAME: f = @composite_matmul_with_bias_and_relu6_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" +// CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) +// CHECK-SAME: f = @composite_matmul_with_bias_and_relu_fn_1 +// CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]]) +// CHECK-SAME: f = @composite_matmul_with_bias_fn_1 +// CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] +// CHECK: } + +// CHECK-LABEL: private @composite_matmul_with_bias_and_relu6_fn_1 +// CHECK-NEXT: %[[matmul:.*]] = "tf.MatMul"(%arg0, %arg1) +// CHECK-SAME: attr_map = "0:transpose_a,1:transpose_b" +// CHECK-NEXT: tf.BiasAdd +// CHECK-NEXT: tf.Relu6 +// CHECK-NEXT: return + +// CHECK-LABEL: private @composite_matmul_with_bias_and_relu_fn_1 +// CHECK-NEXT: tf.MatMul"(%arg0, %arg1) +// CHECK-SAME: attr_map = "0:transpose_a,1:transpose_b" +// CHECK-NEXT: tf.BiasAdd +// CHECK-NEXT: tf.Relu +// CHECK-NEXT: return + +// CHECK-LABEL: private @composite_matmul_with_bias_fn_1 +// CHECK-NEXT: tf.MatMul"(%arg0, %arg1) +// CHECK-SAME: attr_map = "0:transpose_a,1:transpose_b" +// CHECK-NEXT: tf.BiasAdd +// CHECK-NEXT: return +} + +// ----- + +func.func @float_matmul_with_reshape(%arg0: tensor<1x10xf32>, %arg1: tensor<10x10xf32>) -> (tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<10xf32>} : () -> tensor<10xf32> + %cst_0 = "tf.Const"() {value = dense<[-1, 10]> : tensor<2xi32>} : () -> tensor<2xi32> + %1 = "tf.MatMul"(%arg0, %arg1) { + transpose_a = false, transpose_b = true + } : (tensor<1x10xf32>, tensor<10x10xf32>) -> tensor<*xf32> + %2 = "tf.Reshape"(%1, %cst_0) : (tensor<*xf32>, tensor<2xi32>) -> tensor<*xf32> + %3 = "tf.BiasAdd"(%2, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<10xf32>) -> tensor<*xf32> + + func.return %3 : tensor<*xf32> + + +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<10xf32>}> +// CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<[-1, 10]> : tensor<2xi32>}> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1, %[[CONST_0]], %[[SHAPE]]) +// CHECK-SAME: f = 
@composite_matmul_with_reshape_and_bias_fn_1 +// CHECK: return %[[PARTITIONEDCALL_0]] +// CHECK: } + +// CHECK-LABEL: private @composite_matmul_with_reshape_and_bias_fn_1 +// CHECK-NEXT: tf.MatMul"(%arg0, %arg1) +// CHECK-SAME: attr_map = "0:transpose_a,1:transpose_b" +// CHECK-NEXT: tf.Reshape +// CHECK-NEXT: tf.BiasAdd +// CHECK-NEXT: return +} + +// ----- + +// CHECK-LABEL: float_conv_no_bias +func.func @float_conv_no_bias(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) { + %0 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %1 = "tf.Relu6"(%0) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %3 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %4 = "tf.Relu"(%3) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %6 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + func.return %1, %4, %6 : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> + +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1) +// CHECK-SAME: f = @composite_conv2d_with_relu6_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" + +// CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %arg1) +// CHECK-SAME: f = @composite_conv2d_with_relu_fn_1 + +// CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %arg1) +// CHECK-SAME: f = @composite_conv2d_fn_1 +// CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] +// CHECK: } + +// CHECK-LABEL: private @composite_conv2d_with_relu6_fn_1 +// CHECK-LABEL: private @composite_conv2d_with_relu_fn_1 +// CHECK-LABEL: private @composite_conv2d_fn_1 +} + +// ----- + +// CHECK-LABEL: float_depthwise_conv_no_bias +func.func @float_depthwise_conv_no_bias(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x1xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) { + %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1] + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xf32>) -> tensor<*xf32> + %1 = "tf.Relu6"(%0) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %3 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1] + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xf32>) -> tensor<*xf32> + %4 = "tf.Relu"(%3) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %6 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1] + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xf32>) -> tensor<*xf32> + func.return %1, %4, %6 : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> + +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1) +// CHECK-SAME: f = @composite_depthwise_conv2d_with_relu6_fn_1}> +// 
CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" +// CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %arg1) +// CHECK-SAME: f = @composite_depthwise_conv2d_with_relu_fn_1 +// CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %arg1) +// CHECK-SAME: f = @composite_depthwise_conv2d_fn_1 +// CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] +// CHECK: } + +// CHECK-LABEL: private @composite_depthwise_conv2d_with_relu6_fn_1 +// CHECK-LABEL: private @composite_depthwise_conv2d_with_relu_fn_1 +// CHECK-LABEL: private @composite_depthwise_conv2d_fn_1 +} + +// ----- + +// CHECK-LABEL: float_matmul_no_bias +func.func @float_matmul_no_bias( + %arg0: tensor<1x10xf32>, %arg1: tensor<10x10xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) { + %0 = "tf.MatMul"(%arg0, %arg1) { + transpose_a = false, transpose_b = false + } : (tensor<1x10xf32>, tensor<10x10xf32>) -> tensor<*xf32> + %1 = "tf.Relu6"(%0) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + %3 = "tf.MatMul"(%arg0, %arg1) { + transpose_a = true, transpose_b = false + } : (tensor<1x10xf32>, tensor<10x10xf32>) -> tensor<*xf32> + %4 = "tf.Relu"(%3) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + %6 = "tf.MatMul"(%arg0, %arg1) { + transpose_a = false, transpose_b = true + } : (tensor<1x10xf32>, tensor<10x10xf32>) -> tensor<*xf32> + func.return %1, %4, %6 : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> + +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1) +// CHECK-SAME: f = @composite_matmul_with_relu6_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" +// CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %arg1) +// CHECK-SAME: f = @composite_matmul_with_relu_fn_1 +// CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %arg1) +// CHECK-SAME: f = @composite_matmul_fn_1 +// CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] +// CHECK: } + +// CHECK-LABEL: private @composite_matmul_with_relu6_fn_1 +// CHECK-LABEL: private @composite_matmul_with_relu_fn_1 +// CHECK-LABEL: private @composite_matmul_fn_1 +} + +// ----- + +// CHECK-LABEL: conv3d_no_bias +func.func @conv3d_no_bias(%arg0: tensor<1x3x4x3x3xf32>) -> (tensor<1x3x2x3x2xf32>, tensor<1x3x2x3x2xf32>, tensor<1x3x2x3x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense<1.0> : tensor<2x3x3x3x2xf32>} : () -> tensor<2x3x3x3x2xf32> + %0 = "tf.Conv3D"(%arg0, %cst) { + data_format = "NDHWC", device = "", dilations = [1, 1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1, 1] + } : (tensor<1x3x4x3x3xf32>, tensor<2x3x3x3x2xf32>) -> tensor<1x3x2x3x2xf32> + %1 = "tf.Relu"(%0) {device = ""} : (tensor<1x3x2x3x2xf32>) -> tensor<1x3x2x3x2xf32> + + %2 = "tf.Conv3D"(%arg0, %cst) { + data_format = "NDHWC", device = "", dilations = [1, 1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1, 1] + } : (tensor<1x3x4x3x3xf32>, tensor<2x3x3x3x2xf32>) -> tensor<1x3x2x3x2xf32> + %3 = "tf.Relu6"(%2) {device = ""} : (tensor<1x3x2x3x2xf32>) -> tensor<1x3x2x3x2xf32> + + %4 = "tf.Conv3D"(%arg0, %cst) { + data_format = "NDHWC", device = "", dilations = [1, 1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1, 1] + } : (tensor<1x3x4x3x3xf32>, tensor<2x3x3x3x2xf32>) -> tensor<1x3x2x3x2xf32> + + return %1, %3, %4 : tensor<1x3x2x3x2xf32>, tensor<1x3x2x3x2xf32>, tensor<1x3x2x3x2xf32> + +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor<2x3x3x3x2xf32> + +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) +// CHECK-SAME: f = 
@composite_conv3d_with_relu_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" + +// CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) +// CHECK-SAME: f = @composite_conv3d_with_relu6_fn_1 + +// CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) +// CHECK-SAME: f = @composite_conv3d_fn_1 + +// CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] + +// CHECK-LABEL: private @composite_conv3d_with_relu_fn_1 +// CHECK-LABEL: private @composite_conv3d_with_relu6_fn_1 +// CHECK-LABEL: private @composite_conv3d_fn_1 +} + +// ----- + +// CHECK-LABEL: conv3d_with_bias +func.func @conv3d_with_bias(%arg0: tensor<1x3x4x3x3xf32>) -> (tensor<1x3x2x3x2xf32>, tensor<1x3x2x3x2xf32>, tensor<1x3x2x3x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense<1.0> : tensor<2x3x3x3x2xf32>} : () -> tensor<2x3x3x3x2xf32> + %cst_1 = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv3D"(%arg0, %cst) { + data_format = "NDHWC", device = "", dilations = [1, 1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1, 1] + } : (tensor<1x3x4x3x3xf32>, tensor<2x3x3x3x2xf32>) -> tensor<1x3x2x3x2xf32> + %1 = "tf.BiasAdd"(%0, %cst_1) {data_format = "NHWC", device = ""} : (tensor<1x3x2x3x2xf32>, tensor<2xf32>) -> tensor<1x3x2x3x2xf32> + %2 = "tf.Relu"(%1) {device = ""} : (tensor<1x3x2x3x2xf32>) -> tensor<1x3x2x3x2xf32> + + %3 = "tf.Conv3D"(%arg0, %cst) { + data_format = "NDHWC", device = "", dilations = [1, 1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1, 1] + } : (tensor<1x3x4x3x3xf32>, tensor<2x3x3x3x2xf32>) -> tensor<1x3x2x3x2xf32> + %4 = "tf.BiasAdd"(%3, %cst_1) {data_format = "NHWC", device = ""} : (tensor<1x3x2x3x2xf32>, tensor<2xf32>) -> tensor<1x3x2x3x2xf32> + %5 = "tf.Relu6"(%4) {device = ""} : (tensor<1x3x2x3x2xf32>) -> tensor<1x3x2x3x2xf32> + + %6 = "tf.Conv3D"(%arg0, %cst) { + data_format = "NDHWC", device = "", dilations = [1, 1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1, 1] + } : (tensor<1x3x4x3x3xf32>, tensor<2x3x3x3x2xf32>) -> tensor<1x3x2x3x2xf32> + %7 = "tf.BiasAdd"(%6, %cst_1) {data_format = "NHWC", device = ""} : (tensor<1x3x2x3x2xf32>, tensor<2xf32>) -> tensor<1x3x2x3x2xf32> + + return %2, %5, %7 : tensor<1x3x2x3x2xf32>, tensor<1x3x2x3x2xf32>, tensor<1x3x2x3x2xf32> + +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor<2x3x3x3x2xf32> +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {{.*}} : () -> tensor<2xf32> + +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]], %[[CST_1]]) +// CHECK-SAME: f = @composite_conv3d_with_bias_and_relu_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" + +// CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]], %[[CST_1]]) +// CHECK-SAME: f = @composite_conv3d_with_bias_and_relu6_fn_1 + +// CHECK: %[[PARTITIONEDCALL_2:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]], %[[CST_1]]) +// CHECK-SAME: f = @composite_conv3d_with_bias_fn_1 + +// CHECK: return %[[PARTITIONEDCALL_0]], %[[PARTITIONEDCALL_1]], %[[PARTITIONEDCALL_2]] + +// CHECK-LABEL: private @composite_conv3d_with_bias_and_relu_fn_1 +// CHECK-LABEL: private @composite_conv3d_with_bias_and_relu6_fn_1 +// CHECK-LABEL: private @composite_conv3d_with_bias_fn_1 +} + +// ----- + +// Test that the name of composite functions are deterministic. There are 3 +// unsorted functions in this module and each function has 2 quantizable ops. 
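+// The CHECK lines below expect the numbering of the lifted functions to be
+// stable regardless of the order in which @float_conv_1/2/3 are defined:
+// @float_conv_1 maps to @composite_conv2d_with_bias_and_relu6_fn_1/_fn_2,
+// @float_conv_2 to _fn_3/_fn_4, and @float_conv_3 to _fn_5/_fn_6.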
+module { + func.func @float_conv_3(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %3 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %4 = "tf.BiasAdd"(%3, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %5 = "tf.Relu6"(%4) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + func.return %2, %5 : tensor<*xf32>, tensor<*xf32> + } + + func.func @float_conv_1(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %3 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %4 = "tf.BiasAdd"(%3, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %5 = "tf.Relu6"(%4) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + func.return %2, %5 : tensor<*xf32>, tensor<*xf32> + } + + func.func @float_conv_2(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %3 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %4 = "tf.BiasAdd"(%3, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %5 = "tf.Relu6"(%4) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + func.return %2, %5 : tensor<*xf32>, tensor<*xf32> + } +} + +// CHECK-LABEL: @float_conv_3 +// CHECK: "tf.PartitionedCall" +// 
CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_6 +// CHECK: "tf.PartitionedCall" +// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_5 + +// CHECK-LABEL: @float_conv_1 +// CHECK: "tf.PartitionedCall" +// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_2 +// CHECK: "tf.PartitionedCall" +// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_1 + +// CHECK-LABEL: @float_conv_2 +// CHECK: "tf.PartitionedCall" +// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_4 +// CHECK: "tf.PartitionedCall" +// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_3 + diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_lift_quantizable_spots_as_functions_drq.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_lift_quantizable_spots_as_functions_drq.mlir new file mode 100644 index 000000000000..4221c247b5f5 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_lift_quantizable_spots_as_functions_drq.mlir @@ -0,0 +1,224 @@ +// RUN: tf-quant-opt %s -split-input-file -tf-quant-lift-quantizable-spots-as-functions-drq | FileCheck %s +// RUN: tf-quant-opt %s -split-input-file -tf-quant-lift-quantizable-spots-as-functions-drq='quantization-method=weight_only' | FileCheck --check-prefix=WEIGHTONLY %s + +// CHECK-LABEL: lift_float_matmul +func.func @lift_float_matmul(%arg0: tensor<1x12x12x512xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<512x512xf32>} : () -> tensor<512x512xf32> + %out_1 = "tf.MatMul"(%arg0, %cst) { + device = "", transpose_a = false, transpose_b = false + } : (tensor<1x12x12x512xf32>, tensor<512x512xf32>) -> tensor<*xf32> + %out_2 = "tf.MatMul"(%arg0, %arg0) { + device = "", transpose_a = false, transpose_b = true + } : (tensor<1x12x12x512xf32>, tensor<1x12x12x512xf32>) -> tensor<*xf32> + func.return %out_1, %out_2 : tensor<*xf32>, tensor<*xf32> + +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<512x512xf32>}> : () -> tensor<512x512xf32> +// CHECK: %[[PARTITIONEDCALL:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST]]) +// CHECK-SAME: f = @composite_matmul_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" +// CHECK: %[[UNQUANTIZED_OUTPUT:.*]] = "tf.MatMul"(%arg0, %arg0) +// CHECK: } + +// CHECK-LABEL: private @composite_matmul_fn_1 +// CHECK-NEXT: %[[OUT:.*]] = "tf.MatMul"(%arg0, %arg1) +// CHECK-NEXT: return %[[OUT]] +} + +// ----- + +// CHECK-LABEL: lift_float_conv +func.func @lift_float_conv(%arg0: tensor<1x3x4x3xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_1 = "tf.Const"() {value = dense<3.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "tf.Conv2D"(%arg0, %cst_1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + %3 = "tf.Conv2D"(%arg0, %cst_1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %4 = "tf.BiasAdd"(%3, %cst) {data_format = "NHWC", device = ""} : 
(tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + + func.return %2, %4 : tensor<*xf32>, tensor<*xf32> + +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<3.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1]]) +// CHECK-SAME: f = @composite_conv2d_fn_2}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" +// CHECK: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_0]], %[[CONST_0]]) +// CHECK: %[[RELU6_0:.*]] = "tf.Relu6"(%[[BIASADD_0]]) +// CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1]]) +// CHECK-SAME: f = @composite_conv2d_fn_1 +// CHECK: %[[BIASADD_1:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_1]], %[[CONST_0]]) +// CHECK: return %[[RELU6_0]], %[[BIASADD_1]] +// CHECK: } + +// CHECK-LABEL: private @composite_conv2d_fn_2 +// CHECK-NEXT: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %arg1) +// CHECK-SAME: data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true +// CHECK-SAME: attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations" +// CHECK-NEXT: return %[[CONV2D_0]] + +// CHECK-LABEL: private @composite_conv2d_fn_1 +// CHECK-NEXT: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %arg1) +// CHECK-SAME: attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations" +// CHECK-NEXT: return %[[CONV2D_0]] +} + +// ----- + +// CHECK-LABEL: not_lift_float_conv_with_non_constant_weights +func.func @not_lift_float_conv_with_non_constant_weights(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + %3 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %4 = "tf.BiasAdd"(%3, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + + func.return %2, %4 : tensor<*xf32>, tensor<*xf32> + +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-NOT: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %arg1) +// CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %arg1) +} + +// ----- + +// CHECK-LABEL: lift_float_depthwise_conv +func.func @lift_float_depthwise_conv(%arg0: tensor<1x3x4x3xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_1 = "tf.Const"() {value = dense<3.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> + %0 = "tf.DepthwiseConv2dNative"(%arg0, %cst_1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 
1, 2, 1] + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + %3 = "tf.DepthwiseConv2dNative"(%arg0, %cst_1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1] + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xf32>) -> tensor<*xf32> + %4 = "tf.BiasAdd"(%3, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + func.return %2, %4 : tensor<*xf32>, tensor<*xf32> + +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<3.000000e+00> : tensor<2x3x3x1xf32>}> : () -> tensor<2x3x3x1xf32> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1]]) +// CHECK-SAME: f = @composite_depthwise_conv2d_fn_2}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" +// CHECK: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_0]], %[[CONST_0]]) +// CHECK: %[[RELU6_0:.*]] = "tf.Relu6"(%[[BIASADD_0]]) +// CHECK: %[[PARTITIONEDCALL_1:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1]]) +// CHECK-SAME: f = @composite_depthwise_conv2d_fn_1 +// CHECK: %[[BIASADD_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_1]], %[[CONST_0]]) +// CHECK: return %[[RELU6_0]], %[[BIASADD_0]] +// CHECK: } + +// CHECK-LABEL: private @composite_depthwise_conv2d_fn_2 +// CHECK-NEXT: %[[DEPTHWISECONV2D_0:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %arg1) +// CHECK-SAME: attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations" +// CHECK-NEXT: return %[[DEPTHWISECONV2D_0:.*]] + +// CHECK-LABEL: private @composite_depthwise_conv2d_fn_1 +// CHECK-NEXT: %[[DEPTHWISECONV2D_0:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %arg1) +// CHECK-SAME: attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations" +// CHECK-NEXT: return %[[DEPTHWISECONV2D_0:.*]] +} + +// ----- + +// CHECK-LABEL: lift_float_conv3d +// WEIGHTONLY-LABEL: lift_float_conv3d +func.func @lift_float_conv3d(%arg0: tensor<1x3x4x3x3xf32>) -> (tensor<1x3x2x3x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense<1.0> : tensor<2x3x3x3x2xf32>} : () -> tensor<2x3x3x3x2xf32> + %0 = "tf.Conv3D"(%arg0, %cst) { + data_format = "NDHWC", device = "", dilations = [1, 1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1, 1] + } : (tensor<1x3x4x3x3xf32>, tensor<2x3x3x3x2xf32>) -> tensor<1x3x2x3x2xf32> + %1 = "tf.Relu"(%0) {device = ""} : (tensor<1x3x2x3x2xf32>) -> tensor<1x3x2x3x2xf32> + return %1: tensor<1x3x2x3x2xf32> + +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor<2x3x3x3x2xf32> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) +// CHECK-SAME: f = @composite_conv3d_fn_1}> +// CHECK-NOT: {_tfl_quant_trait = "fully_quantizable" +// CHECK: %[[RELU:.*]] = "tf.Relu"(%[[PARTITIONEDCALL_0]]) +// CHECK: return %[[RELU]] + +// CHECK-LABEL: private @composite_conv3d_fn_1 + +// WEIGHTONLY-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor<2x3x3x3x2xf32> +// WEIGHTONLY: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) +// WEIGHTONLY-SAME: f = @composite_conv3d_fn_1}> +// WEIGHTONLY: {_tfl_quant_trait = "fully_quantizable" +// WEIGHTONLY: %[[RELU:.*]] = "tf.Relu"(%[[PARTITIONEDCALL_0]]) +// WEIGHTONLY: return %[[RELU]] + +// WEIGHTONLY-LABEL: private @composite_conv3d_fn_1 +} + +// ----- 
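+// Tests lifting of BatchMatMulV2 into a composite function. In the default
+// drq mode the lifted call is not expected to carry the
+// `_tfl_quant_trait = "fully_quantizable"` attribute, whereas under
+// `quantization-method=weight_only` the WEIGHTONLY checks expect it.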
+ +// CHECK-LABEL: lift_float_batch_matmul +// WEIGHTONLY-LABEL: lift_float_batch_matmul +func.func @lift_float_batch_matmul(%arg0: tensor<4x4x3xf32>) -> (tensor<4x4x3xf32>) { + %cst = "tf.Const"() {device = "", value = dense<1.0> : tensor<4x3x3xf32>} : () -> tensor<4x3x3xf32> + %0 = "tf.BatchMatMulV2"(%arg0, %cst) {adj_x = false, adj_y = false, device = ""} : (tensor<4x4x3xf32>, tensor<4x3x3xf32>) -> tensor<4x4x3xf32> + return %0 : tensor<4x4x3xf32> + +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor<4x3x3xf32> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) +// CHECK-SAME: f = @composite_batch_matmul_fn_1}> +// CHECK-NOT: {_tfl_quant_trait = "fully_quantizable" +// CHECK: return %[[PARTITIONEDCALL_0]] + +// CHECK-LABEL: private @composite_batch_matmul_fn_1 + +// WEIGHTONLY-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor<4x3x3xf32> +// WEIGHTONLY: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CST]]) +// WEIGHTONLY-SAME: f = @composite_batch_matmul_fn_1}> +// WEIGHTONLY-SAME: {_tfl_quant_trait = "fully_quantizable" +// WEIGHTONLY: return %[[PARTITIONEDCALL_0]] + +// WEIGHTONLY-LABEL: private @composite_batch_matmul_fn_1 +} + +// ----- + +// CHECK-LABEL: lift_float_gather +// WEIGHTONLY-LABEL: lift_float_gather +func.func @lift_float_gather(%arg0: tensor<6xi64>) -> (tensor<6x32xf32>) { + %cst = "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "", value = dense<1.0> : tensor<128x32xf32>} : () -> tensor<128x32xf32> + %0 = "tf.GatherV2"(%cst_0, %arg0, %cst) {batch_dims = 0 : i64, device = ""} : (tensor<128x32xf32>, tensor<6xi64>, tensor) -> tensor<6x32xf32> + return %0 : tensor<6x32xf32> + +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {{.*}} : () -> tensor<128x32xf32> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%[[CST_1]], %arg0, %[[CST]]) +// CHECK-SAME: f = @composite_gather_fn_1}> +// CHECK-SAME: {_tfl_quant_trait = "fully_quantizable" +// CHECK: return %[[PARTITIONEDCALL_0]] + +// WEIGHTONLY-DAG: %[[CST:.*]] = "tf.Const"() {{.*}} : () -> tensor +// WEIGHTONLY-DAG: %[[CST_1:.*]] = "tf.Const"() {{.*}} : () -> tensor<128x32xf32> +// WEIGHTONLY: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%[[CST_1]], %arg0, %[[CST]]) +// WEIGHTONLY-SAME: f = @composite_gather_fn_1}> +// WEIGHTONLY-SAME: {_tfl_quant_trait = "fully_quantizable" +// WEIGHTONLY: return %[[PARTITIONEDCALL_0]] + +// WEIGHTONLY-LABEL: private @composite_gather_fn_1 +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_mark_functions_noinline.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_mark_functions_noinline.mlir new file mode 100644 index 000000000000..59455bb107fa --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_mark_functions_noinline.mlir @@ -0,0 +1,24 @@ +// RUN: tf-quant-opt %s -tf-mark-functions-noinline='noinline-functions=noinline0' \ +// RUN: -allow-unregistered-dialect -mlir-disable-threading \ +// RUN: -split-input-file -verify-diagnostics | FileCheck %s + +// Tests that the function is marked tf._noinline = true. 
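+// Only the functions named in the `noinline-functions` pass option (here
+// `noinline0`) are expected to get the attribute; the second test case below
+// checks that an unlisted function is left unannotated.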
+ +// CHECK-LABEL: @noinline0 +// CHECK-SAME: attributes {{{.*tf._noinline = true.*}}} +func.func @noinline0() -> (tensor<0xf32>) { + %cst = "tf.Const"() {value = dense<1.0> : tensor<0xf32>} : () -> tensor<0xf32> + return %cst : tensor<0xf32> +} + +// ----- + +// Tests that the function not listed in the option `noinline-functions` +// is not marked tf._noinline = true. + +// CHECK-LABEL: @inline +// CHECK-NOT: tf._noinline +func.func @inline() -> (tensor<0xf32>) { + %cst = "tf.Const"() {value = dense<1.0> : tensor<0xf32>} : () -> tensor<0xf32> + return %cst : tensor<0xf32> +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_merge_duplicate_resource_ops.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_merge_duplicate_resource_ops.mlir new file mode 100644 index 000000000000..5ff77d281399 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_merge_duplicate_resource_ops.mlir @@ -0,0 +1,108 @@ +// RUN: tf-quant-opt %s -split-input-file -tf-quant-merge-duplicate-resource-ops | FileCheck %s + +func.func @merge_duplicate_variable(%arg0: tensor<1x20xf32>, %arg1: tensor) -> (tensor<20x4096xf32>) { + %0 = tf_executor.graph { + %outputs_5, %control_6 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_7, %control_8 = tf_executor.island wraps "tf.Const"() {value = dense<"MatMul/b_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_9, %control_10 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "MatMul/b_0"} : () -> tensor>> + %outputs_11, %control_12 = tf_executor.island wraps "tf.RestoreV2"(%arg1, %outputs_7, %outputs_5) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<20x4096xf32> + %control_13 = tf_executor.island(%control_12) wraps "tf.AssignVariableOp"(%outputs_9, %outputs_11) {validate_shape = false} : (tensor>>, tensor<20x4096xf32>) -> () + %control_14 = tf_executor.island(%control_13) wraps "tf.NoOp"() : () -> () + %outputs_15, %control_16 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "MatMul/b_0"} : () -> tensor>> + %outputs_17, %control_18 = tf_executor.island wraps "tf.ReadVariableOp"(%outputs_15) : (tensor>>) -> tensor<20x4096xf32> + %outputs_19, %control_20 = tf_executor.island wraps "tf.Const"() {value = dense<"MatMul/b_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_21, %control_22 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %control_23 = tf_executor.island(%control_18) wraps "tf.SaveV2"(%arg1, %outputs_19, %outputs_21, %outputs_17) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor<20x4096xf32>) -> () + %outputs_24, %control_25 = tf_executor.island(%control_23) wraps "tf.Identity"(%arg1) : (tensor) -> tensor + tf_executor.fetch %outputs_17, %control_14, %control_25 : tensor<20x4096xf32>, !tf_executor.control, !tf_executor.control + } + return %0 : tensor<20x4096xf32> +} +// CHECK-LABEL: @merge_duplicate_variable +// CHECK: %[[OUT_0:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.VarHandleOp"() +// CHECK: %[[OUT_1:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.RestoreV2" +// CHECK: %[[CTL_2:.*]] = tf_executor.island(%[[CTL_1]]) wraps "tf.AssignVariableOp"(%[[OUT_0]], %[[OUT_1]]) + +// Check that ReadVariableOp now use the same variable op. 
+// CHECK: %[[OUT_3:.*]], %[[CTL_3:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%[[OUT_0]]) + +// ----- + +func.func @variables_with_different_shared_names(%arg0: tensor<1x20xf32>, %arg1: tensor) -> (tensor<20x4096xf32>) { + %0 = tf_executor.graph { + %outputs_5, %control_6 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_7, %control_8 = tf_executor.island wraps "tf.Const"() {value = dense<"MatMul/b_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_9, %control_10 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "MatMul/b_0"} : () -> tensor>> + %outputs_11, %control_12 = tf_executor.island wraps "tf.RestoreV2"(%arg1, %outputs_7, %outputs_5) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<20x4096xf32> + %control_13 = tf_executor.island(%control_12) wraps "tf.AssignVariableOp"(%outputs_9, %outputs_11) {validate_shape = false} : (tensor>>, tensor<20x4096xf32>) -> () + %control_14 = tf_executor.island(%control_13) wraps "tf.NoOp"() : () -> () + %outputs_15, %control_16 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "MatMul/b_1"} : () -> tensor>> + %outputs_17, %control_18 = tf_executor.island wraps "tf.ReadVariableOp"(%outputs_15) : (tensor>>) -> tensor<20x4096xf32> + %outputs_19, %control_20 = tf_executor.island wraps "tf.Const"() {value = dense<"MatMul/b_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_21, %control_22 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %control_23 = tf_executor.island(%control_18) wraps "tf.SaveV2"(%arg1, %outputs_19, %outputs_21, %outputs_17) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor<20x4096xf32>) -> () + %outputs_24, %control_25 = tf_executor.island(%control_23) wraps "tf.Identity"(%arg1) : (tensor) -> tensor + tf_executor.fetch %outputs_17, %control_14, %control_25 : tensor<20x4096xf32>, !tf_executor.control, !tf_executor.control + } + return %0 : tensor<20x4096xf32> +} +// CHECK-LABEL: @variables_with_different_shared_names +// CHECK: %[[OUT_0:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.VarHandleOp"() +// CHECK-SAME: shared_name = "MatMul/b_0" +// CHECK: %[[OUT_1:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.RestoreV2" +// CHECK: %[[CTL_2:.*]] = tf_executor.island(%[[CTL_1]]) wraps "tf.AssignVariableOp"(%[[OUT_0]], %[[OUT_1]]) + +// Check that the second variable is not removed since they have different +// `shared_name` attribute. +// CHECK: %[[OUT_3:.*]], %[[CTL_3:.*]] = tf_executor.island wraps "tf.VarHandleOp"() +// CHECK-SAME: shared_name = "MatMul/b_1" +// CHECK: %[[OUT_4:.*]], %[[CTL_4:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%[[OUT_3]]) + +// ----- + +// Test two resource ops have the same shared_name but different types. 
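+// Instead of merging the two ops, the pass is expected to emit a diagnostic;
+// the `expected-error` annotation below records the expected message.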
+// expected-error @+1 {{This op has the same `shared_name` but different type with another}} +func.func @same_shared_name_but_different_types(%arg0: tensor<1x20xf32>, %arg1: tensor) -> (tensor<20x4096xf32>) { + %0 = tf_executor.graph { + %outputs_5, %control_6 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_7, %control_8 = tf_executor.island wraps "tf.Const"() {value = dense<"MatMul/b_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_9, %control_10 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "MatMul/b_0"} : () -> tensor>> + %outputs_11, %control_12 = tf_executor.island wraps "tf.RestoreV2"(%arg1, %outputs_7, %outputs_5) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<20x4096xf32> + %control_13 = tf_executor.island(%control_12) wraps "tf.AssignVariableOp"(%outputs_9, %outputs_11) {validate_shape = false} : (tensor>>, tensor<20x4096xf32>) -> () + %control_14 = tf_executor.island(%control_13) wraps "tf.NoOp"() : () -> () + %outputs_15, %control_16 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "MatMul/b_0"} : () -> tensor>> + %outputs_17, %control_18 = tf_executor.island wraps "tf.ReadVariableOp"(%outputs_15) : (tensor>>) -> tensor<20x4096xf32> + %outputs_19, %control_20 = tf_executor.island wraps "tf.Const"() {value = dense<"MatMul/b_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_21, %control_22 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %control_23 = tf_executor.island(%control_18) wraps "tf.SaveV2"(%arg1, %outputs_19, %outputs_21, %outputs_17) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor<20x4096xf32>) -> () + %outputs_24, %control_25 = tf_executor.island(%control_23) wraps "tf.Identity"(%arg1) : (tensor) -> tensor + tf_executor.fetch %outputs_17, %control_14, %control_25 : tensor<20x4096xf32>, !tf_executor.control, !tf_executor.control + } + return %0 : tensor<20x4096xf32> +} + +// ----- + +func.func @merge_hashtable_ops(%arg0: tensor) -> (tensor) { + %0 = tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_ce3dfbfc-7367-4d62-9d48-d13bf8125391", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %outputs_0, %control_1 = tf_executor.island wraps "tf.LookupTableSizeV2"(%outputs) {device = ""} : (tensor) -> tensor + %outputs_2, %control_3 = tf_executor.island wraps "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + %outputs_4, %control_5 = tf_executor.island wraps "tf.Identity"(%outputs_0) : (tensor) -> tensor + %control_8 = tf_executor.island(%control_3, %control_5) wraps "tf.NoOp"() : () -> () + %outputs_9, %control_10 = tf_executor.island wraps "tf.Const"() {value = dense<["hello", "model", "quantization"]> : tensor<3x!tf_type.string>} : () -> tensor<3x!tf_type.string> + %outputs_11, %control_12 = tf_executor.island wraps "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi64>} : () -> tensor<3xi64> + %outputs_13, %control_14 = tf_executor.island wraps "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_ce3dfbfc-7367-4d62-9d48-d13bf8125391", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %control_15 = tf_executor.island wraps 
"tf.LookupTableImportV2"(%outputs_13, %outputs_9, %outputs_11) {_has_manual_control_dependencies = true, device = ""} : (tensor, tensor<3x!tf_type.string>, tensor<3xi64>) -> () + %control_16 = tf_executor.island(%control_15) wraps "tf.NoOp"() : () -> () + tf_executor.fetch %outputs_4, %control_8, %control_16 : tensor, !tf_executor.control, !tf_executor.control + } + return %0 : tensor +} + +// CHECK-LABEL: @merge_hashtable_ops +// CHECK: %[[OUT_0:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.HashTableV2"() +// CHECK: %[[OUT_1:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.LookupTableSizeV2"(%[[OUT_0]]) + +// Check that LookupTableImportV2 is using the same HashTableV2 with LookupTableSizeV2. +// CHECK: %[[CTL_2:.*]] = tf_executor.island wraps "tf.LookupTableImportV2"(%[[OUT_0]] diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_merge_initializer_function_ops_to_main.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_merge_initializer_function_ops_to_main.mlir new file mode 100644 index 000000000000..c3ec753160a3 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_merge_initializer_function_ops_to_main.mlir @@ -0,0 +1,564 @@ +// RUN: tf-quant-opt %s -tf-quant-merge-initializer-function-ops-to-main \ +// RUN: -allow-unregistered-dialect -mlir-disable-threading \ +// RUN: -split-input-file -verify-diagnostics | FileCheck %s +// RUN: tf-quant-opt %s -tf-quant-merge-initializer-function-ops-to-main \ +// RUN: -allow-unregistered-dialect -mlir-disable-threading \ +// RUN: -split-input-file -mlir-print-local-scope -mlir-print-debuginfo \ +// RUN: -verify-diagnostics | FileCheck %s --check-prefix CHECK-LOC + +// CHECK-LABEL: module attributes +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1228 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@NoOp]} : () -> () +// Check that the initializers list is empty. +// CHECK: "tf_saved_model.session_initializer"() +// CHECK-SAME: initializers = [] + + func.func @NoOp() + attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"], tf_saved_model.initializer_type = "init_op"} { + tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.Const"() {device = "", value = dense<["test"]> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %out_0, %ctl_1 = tf_executor.island wraps "tf.Const"() {device = "", value = dense<[1]> : tensor<1xi64>} : () -> tensor<1xi64> + %out_1, %ctl_2 = tf_executor.island wraps "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "1", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %ctl_3 = tf_executor.island wraps "tf.LookupTableImportV2"(%out_1, %out, %out_0) {device = ""} : (tensor, tensor<1x!tf_type.string>, tensor<1xi64>) -> () + tf_executor.fetch %ctl_3 : !tf_executor.control + } + return + } +// The session initializer function is removed. 
+// CHECK-NOT: @NoOp() + + func.func private @serving_default(%arg0: tensor) -> tensor<*xi64> attributes {tf.entry_function = {control_outputs = "", inputs = "input:0", outputs = "output:0"}} { + %0 = tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.Const"() {device = "", value = dense<-1> : tensor} : () -> tensor + %out_0, %ctl_1 = tf_executor.island wraps "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "1", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %out_1, %ctl_2 = tf_executor.island wraps "tf.LookupTableFindV2"(%out_0, %arg0, %out) {device = ""} : (tensor, tensor, tensor) -> tensor<*xi64> + tf_executor.fetch %out_1 : tensor<*xi64> + } + return %0 : tensor<*xi64> + } +// Sanity check: The contents of @serving_default is untouched. +// CHECK: func.func private @serving_default(%[[ARG_0:.*]]: tensor) -> tensor<*xi64> +// CHECK-NEXT: %[[RES:.*]] = tf_executor.graph +// CHECK: %[[OUT:.*]], %[[CTL:.*]] = tf_executor.island wraps "tf.Const"() +// CHECK-NEXT: %[[OUT_0:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.HashTableV2"() +// CHECK-NEXT: %[[OUT_1:.*]], %[[CTL_2:.*]] = tf_executor.island wraps "tf.LookupTableFindV2"(%[[OUT_0]], %[[ARG_0]], %[[OUT]]) +// CHECK-NEXT: tf_executor.fetch %[[OUT_1]] : tensor<*xi64> +// CHECK: return %[[RES]] : tensor<*xi64> + + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["serving_default_input_vocabs:0"]}) -> (tensor<*xi64> {tf_saved_model.index_path = ["StatefulPartitionedCall:0"]}) + attributes {tf.entry_function = {inputs = "serving_default_input_vocabs:0", outputs = "StatefulPartitionedCall:0"}, tf_saved_model.exported_names = ["main"]} { + %0 = tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @serving_default} : (tensor) -> tensor<*xi64> + tf_executor.fetch %out : tensor<*xi64> + } + return %0 : tensor<*xi64> + } +// Sanity check: The main function's signature & attributes have not changed. +// CHECK: func.func @main(%[[ARG:.*]]: tensor +// CHECK-SAME: tf_saved_model.index_path = ["serving_default_input_vocabs:0"] +// CHECK-SAME: -> (tensor<*xi64> {tf_saved_model.index_path = ["StatefulPartitionedCall:0"]}) +// CHECK-SAME: tf.entry_function = {inputs = "serving_default_input_vocabs:0", outputs = "StatefulPartitionedCall:0"} +// CHECK-SAME: tf_saved_model.exported_names = ["main"] + +// CHECK: %[[GRAPH_OUT:.*]] = tf_executor.graph +// CHECK-NEXT: %[[OUT:.*]], %[[CTL:.*]] = tf_executor.island wraps "tf.PartitionedCall"(%[[ARG]]) +// CHECK-SAME: f = @serving_default +// Checks that the contents of @NoOp are copied here. +// CHECK-NEXT: %[[OUT_0:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.Const"() +// CHECK-SAME: value = dense<"test"> +// CHECK-NEXT: %[[OUT_1:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.Const"() +// CHECK-SAME: value = dense<1> +// CHECK-NEXT: %[[OUT_2:.*]], %[[CTL_2:.*]] = tf_executor.island wraps "tf.HashTableV2"() +// CHECK-NEXT: %[[CTL_3:.*]] = tf_executor.island wraps "tf.LookupTableImportV2"(%[[OUT_2]], %[[OUT_0]], %[[OUT_1]]) +// Checks that the NoOp with control dependency to the control output for the +// initializer function is created & fetched. 
+// CHECK-NEXT: %[[CTL_4:.*]] = tf_executor.island(%[[CTL_3]]) wraps "tf.NoOp"() +// CHECK-NEXT: tf_executor.fetch %[[OUT]], %[[CTL_4]] : tensor<*xi64>, !tf_executor.control +// CHECK-NEXT: } +// CHECK-NEXT: return %[[GRAPH_OUT]] : tensor<*xi64> + +// Checks that the location for the init op is properly set. +// CHECK-LOC-LABEL: func.func @main +// CHECK-LOC: tf_executor.island({{.*}}) wraps "tf.NoOp"() +// CHECK-LOC-SAME: loc("init_op_NoOp") +} + +// ----- + +// Tests when the initializer function contains multiple stateful +// initialization ops. They should be transitively connected through +// control dependencies (!tf_executor.control), which is guaranteed by +// the `tf-executor-break-up-islands` pass. + +// CHECK-LABEL: module attributes +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1228 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@NoOp]} : () -> () +// Check that the initializers list is empty. +// CHECK: "tf_saved_model.session_initializer"() +// CHECK-SAME: initializers = [] + + func.func @NoOp() + attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"], tf_saved_model.initializer_type = "init_op"} { + tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.Const"() {device = "", value = dense<["test_1"]> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %out_0, %ctl_1 = tf_executor.island wraps "tf.Const"() {device = "", value = dense<[1]> : tensor<1xi64>} : () -> tensor<1xi64> + %out_1, %ctl_2 = tf_executor.island wraps "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "1", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %ctl_3 = tf_executor.island wraps "tf.LookupTableImportV2"(%out_1, %out, %out_0) {device = ""} : (tensor, tensor<1x!tf_type.string>, tensor<1xi64>) -> () + + %out_2, %ctl_4 = tf_executor.island wraps "tf.Const"() {device = "", value = dense<["test_2"]> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %out_3, %ctl_5 = tf_executor.island wraps "tf.Const"() {device = "", value = dense<[2]> : tensor<1xi64>} : () -> tensor<1xi64> + // Has a control dependency to the previous LookupTableImportV2. + %out_4, %ctl_6 = tf_executor.island(%ctl_3) wraps "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "2", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %ctl_7 = tf_executor.island wraps "tf.LookupTableImportV2"(%out_4, %out_2, %out_3) {device = ""} : (tensor, tensor<1x!tf_type.string>, tensor<1xi64>) -> () + tf_executor.fetch %ctl_7 : !tf_executor.control + } + return + } +// The session initializer function is removed. 
+// CHECK-NOT: @NoOp() + + func.func private @serving_default(%arg0: tensor) -> tensor<*xi64> attributes {tf.entry_function = {control_outputs = "", inputs = "input:0", outputs = "output:0"}} { + %0 = tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.Const"() {device = "", value = dense<-1> : tensor} : () -> tensor + %out_0, %ctl_1 = tf_executor.island wraps "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "1", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %out_1, %ctl_2 = tf_executor.island wraps "tf.LookupTableFindV2"(%out_0, %arg0, %out) {device = ""} : (tensor, tensor, tensor) -> tensor<*xi64> + tf_executor.fetch %out_1 : tensor<*xi64> + } + return %0 : tensor<*xi64> + } + + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["serving_default_input_vocabs:0"]}) -> (tensor<*xi64> {tf_saved_model.index_path = ["StatefulPartitionedCall:0"]}) + attributes {tf.entry_function = {inputs = "serving_default_input_vocabs:0", outputs = "StatefulPartitionedCall:0"}, tf_saved_model.exported_names = ["main"]} { + %0 = tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @serving_default} : (tensor) -> tensor<*xi64> + tf_executor.fetch %out : tensor<*xi64> + } + return %0 : tensor<*xi64> + } +// Sanity check: The main function's signature & attributes have not changed. +// CHECK: func.func @main(%[[ARG:.*]]: tensor +// CHECK-SAME: tf_saved_model.index_path = ["serving_default_input_vocabs:0"] +// CHECK-SAME: -> (tensor<*xi64> {tf_saved_model.index_path = ["StatefulPartitionedCall:0"]}) +// CHECK-SAME: tf.entry_function = {inputs = "serving_default_input_vocabs:0", outputs = "StatefulPartitionedCall:0"} +// CHECK-SAME: tf_saved_model.exported_names = ["main"] + +// CHECK: %[[GRAPH_OUT:.*]] = tf_executor.graph +// CHECK-NEXT: %[[OUT:.*]], %[[CTL:.*]] = tf_executor.island wraps "tf.PartitionedCall"(%[[ARG]]) +// CHECK-SAME: f = @serving_default +// Checks that the contents of @NoOp are copied here. +// CHECK-DAG: %[[OUT_0:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<"test_1">.*}}}> +// CHECK-DAG: %[[OUT_1:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<1>.*}}}> + +// CHECK-NEXT: %[[OUT_2:.*]], %[[CTL_2:.*]] = tf_executor.island wraps "tf.HashTableV2"() +// CHECK-NEXT: %[[CTL_3:.*]] = tf_executor.island wraps "tf.LookupTableImportV2"(%[[OUT_2]], %[[OUT_0]], %[[OUT_1]]) + +// CHECK-DAG: %[[OUT_3:.*]], %[[CTL_4:.*]] = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<"test_2">.*}}}> +// CHECK-DAG: %[[OUT_4:.*]], %[[CTL_5:.*]] = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<2>.*}}}> + +// CHECK-NEXT: %[[OUT_5:.*]], %[[CTL_6:.*]] = tf_executor.island(%[[CTL_3]]) wraps "tf.HashTableV2"() +// CHECK-NEXT: %[[CTL_7:.*]] = tf_executor.island wraps "tf.LookupTableImportV2"(%[[OUT_5]], %[[OUT_3]], %[[OUT_4]]) + +// Checks that the NoOp with control dependency to the control output for the +// initializer function is created & fetched. +// CHECK-NEXT: %[[CTL_8:.*]] = tf_executor.island(%[[CTL_7]]) wraps "tf.NoOp"() +// CHECK-NEXT: tf_executor.fetch %[[OUT]], %[[CTL_8]] : tensor<*xi64>, !tf_executor.control +// CHECK-NEXT: } +// CHECK-NEXT: return %[[GRAPH_OUT]] : tensor<*xi64> + +// Checks that the location for the init op is properly set. 
+// CHECK-LOC-LABEL: func.func @main +// CHECK-LOC: tf_executor.island({{.*}}) wraps "tf.NoOp"() +// CHECK-LOC-SAME: loc("init_op_NoOp") +} + +// ----- + +// Test the case where the initializer function accepts an argument but it +// is not used within the body. + +// CHECK-LABEL: module attributes +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1228 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@NoOp]} : () -> () +// Check that the initializers list is empty. +// CHECK: "tf_saved_model.session_initializer"() +// CHECK-SAME: initializers = [] + + "tf_saved_model.asset"() {filename = "assets/file.txt", sym_name = "__tf_saved_model_asset0_file.txt"} : () -> () + + func.func @NoOp(%arg: tensor {tf_saved_model.bound_input = @__tf_saved_model_asset0_file.txt}) + attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"], tf_saved_model.initializer_type = "init_op"} { + tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.Const"() {device = "", value = dense<["test"]> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %out_0, %ctl_1 = tf_executor.island wraps "tf.Const"() {device = "", value = dense<[1]> : tensor<1xi64>} : () -> tensor<1xi64> + %out_1, %ctl_2 = tf_executor.island wraps "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "1", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %ctl_3 = tf_executor.island wraps "tf.LookupTableImportV2"(%out_1, %out, %out_0) {device = ""} : (tensor, tensor<1x!tf_type.string>, tensor<1xi64>) -> () + tf_executor.fetch %ctl_3 : !tf_executor.control + } + return + } +// The session initializer function is removed. +// CHECK-NOT: @NoOp() + + func.func @main() attributes {tf.entry_function = {inputs = "", outputs = ""}, tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } +// Sanity check: The main function's signature & attributes have not changed. +// CHECK: func.func @main() +// CHECK-SAME: tf_saved_model.exported_names = ["main"] + +// CHECK: tf_executor.graph +// Checks that the contents of @NoOp are copied here. +// CHECK-NEXT: %[[OUT_0:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.Const"() +// CHECK-SAME: value = dense<"test"> +// CHECK-NEXT: %[[OUT_1:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.Const"() +// CHECK-SAME: value = dense<1> +// CHECK-NEXT: %[[OUT_2:.*]], %[[CTL_2:.*]] = tf_executor.island wraps "tf.HashTableV2"() +// CHECK-NEXT: %[[CTL_3:.*]] = tf_executor.island wraps "tf.LookupTableImportV2"(%[[OUT_2]], %[[OUT_0]], %[[OUT_1]]) +// Checks that the control output for the initializer function is fetched. +// CHECK-NEXT: %[[CTL_4:.*]] = tf_executor.island(%[[CTL_3]]) wraps "tf.NoOp"() +// CHECK-NEXT: tf_executor.fetch %[[CTL_4]] : !tf_executor.control +// CHECK-NEXT: } +// CHECK-NEXT: return + +// Checks that the location for the init op is properly set. +// CHECK-LOC-LABEL: func.func @main +// CHECK-LOC: tf_executor.island({{.*}}) wraps "tf.NoOp"() +// CHECK-LOC-SAME: loc("init_op_NoOp") +} + +// ----- + +// Test the case where there are 2 initializer functions ("init_op" and +// "restore_op"). The init func of type "init_op" is merged first. 
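+// The "restore_op" initializer contributes the `__tf_file_prefix` argument and
+// the "init_op" initializer contributes the asset-bound argument; the CHECK
+// lines below verify that both arguments are added to @main (restore_op's
+// first) and that each merged initializer ends in its own fetched `tf.NoOp`.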
+ +// CHECK-LABEL: module attributes +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1228 : i32}, tf_saved_model.semantics} { + "tf_saved_model.asset"() {filename = "assets/table.txt", sym_name = "v"} : () -> () + "tf_saved_model.session_initializer"() {initializers = [@NoOp_0, @NoOp_1]} : () -> () +// Check that the initializer typed "init_op" is removed from initializers list. +// CHECK: "tf_saved_model.session_initializer"() +// CHECK-SAME: initializers = [] + +func.func @NoOp_0(%arg0: tensor {tf_saved_model.bound_input = @v}) + attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp_0"], tf_saved_model.initializer_type = "init_op"} { + tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.Identity"(%arg0) : (tensor) -> tensor + tf_executor.fetch %ctl : !tf_executor.control + } + return + } +// The session initializer function is removed. +// CHECK-NOT: @NoOp_0() + + func.func @NoOp_1(%arg0: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]}) + attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp_1"], tf_saved_model.initializer_type = "restore_op"} { + tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.Identity"(%arg0) : (tensor) -> tensor + tf_executor.fetch %ctl : !tf_executor.control + } + return + } +// The session initializer function is removed. +// CHECK-NOT: @NoOp_1() + + func.func @main() attributes {tf.entry_function = {inputs = "", outputs = ""}, tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } +// Check that the args for the "restore_op" is added before the args for the "init_op". +// CHECK: func.func @main(%[[ARG_0:.*]]: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]}, %[[ARG_1:.*]]: tensor {tf_saved_model.bound_input = @v}) +// CHECK-SAME: tf_saved_model.exported_names = ["main"] + +// CHECK: tf_executor.graph +// Checks that the contents of the initializer functions are copied here. +// CHECK-DAG: %[[OUT_0:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.Identity"(%[[ARG_0]]) +// CHECK-DAG: %[[OUT_1:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.Identity"(%[[ARG_1]]) + +// Checks that 2 `NoOp`s having control dependencies to each of the initializer +// functions are created. +// CHECK-DAG: %[[CTL_2:.*]] = tf_executor.island(%[[CTL_0]]) wraps "tf.NoOp"() +// CHECK-DAG: %[[CTL_3:.*]] = tf_executor.island(%[[CTL_1]]) wraps "tf.NoOp"() + +// CHECK: tf_executor.fetch +// CHECK-SAME: !tf_executor.control, !tf_executor.control +// CHECK-NEXT: } +// CHECK-NEXT: return + +// Checks that the location for the init op is properly set. +// CHECK-LOC-LABEL: func.func @main + +// CHECK-LOC-DAG: tf_executor.island({{.*}}) wraps "tf.NoOp"() {{.*}} loc("init_op_NoOp_0") +// CHECK-LOC-DAG: tf_executor.island({{.*}}) wraps "tf.NoOp"() {{.*}} loc("restore_op_NoOp_1") +} + +// ----- + +// Tests that initializer function for "restore_op" is merged into @main. 
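+// After merging, @main performs the restore itself: the cloned `tf.RestoreV2`
+// reads the checkpoint named by the new `__tf_file_prefix` argument, the value
+// is assigned back to the variable, and a trailing `tf.NoOp` with a control
+// dependency on that assignment is fetched. Illustrative sketch only (the
+// exact IR is spelled out by the CHECK lines below):
+//   %restored = "tf.RestoreV2"(%file_prefix, %names, %shapes)
+//   "tf.AssignVariableOp"(%var_handle, %restored)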
+ +// CHECK-LABEL: module +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op]} : () -> () +// CHECK: "tf_saved_model.session_initializer"() <{initializers = []}> + + func.func @init_func_restore_op(%arg: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]}) + attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"], tf_saved_model.initializer_type = "restore_op"} { + tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %out_0, %ctl_0 = tf_executor.island wraps "tf.Const"() {value = dense<"var_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %out_1, %ctl_1 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "var_0", device = "/device:CPU:0"} : () -> tensor>> + %out_2, %ctl_2 = tf_executor.island wraps "tf.RestoreV2"(%arg, %out_0, %out) {} : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<2xf32> + %ctl_3 = tf_executor.island wraps "tf.AssignVariableOp"(%out_1, %out_2) : (tensor>>, tensor<2xf32>) -> () + tf_executor.fetch %ctl_3 : !tf_executor.control + } + return + } + + func.func @main() attributes {tf.entry_function = {inputs = "", outputs = ""}, tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } +// A new argument corresponding to the "file_prefix" should be created. +// CHECK: func.func @main(%[[ARG:.*]]: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]}) +// CHECK-SAME: {{{.*tf.entry_function = {inputs = "restore_op_0:0", outputs = ""}.*}}} +// CHECK-NEXT: tf_executor.graph + +// Checks that the ops from @init_func_restore_op are cloned. +// CHECK-DAG: %[[CONST_0:.*]], %[[CTL:.*]] = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<""> : tensor<1x!tf_type\.string>.*}}}> +// CHECK-DAG: %[[CONST_1:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<"var_0"> : tensor<1x!tf_type\.string>.*}}}> +// CHECK: %[[VAR_HANDLE:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> +// CHECK: %[[RESTORE:.*]], %[[CTL_2:.*]] = tf_executor.island wraps "tf.RestoreV2"(%[[ARG]], %[[CONST_1]], %[[CONST_0]]) +// CHECK: %[[CTL_3:.*]] = tf_executor.island wraps "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[RESTORE]]) +// CHECK: %[[CTL_4:.*]] = tf_executor.island(%[[CTL_3]]) wraps "tf.NoOp"() +// CHECK-NEXT: tf_executor.fetch %[[CTL_4]] : !tf_executor.control +// CHECK: return + +// Checks that the Location is properly set for the NoOp. +// CHECK-LOC: tf_executor.island({{.*}}) wraps "tf.NoOp"() {{.*}} loc("restore_op_init_func_restore_op") +} + +// ----- + +// Test that the argument of the initializer function is correctly merged +// into @main. 
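+// The asset-bound argument of @NoOp becomes an argument of @main, and the
+// cloned `tf.Identity` that consumed it is re-wired to that new argument.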
+ +// CHECK-LABEL: module +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1228 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@NoOp]} : () -> () + "tf_saved_model.asset"() {filename = "assets/file.txt", sym_name = "__tf_saved_model_asset0_file.txt"} : () -> () + + func.func @NoOp(%arg: tensor {tf_saved_model.bound_input = @__tf_saved_model_asset0_file.txt}) + attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"], tf_saved_model.initializer_type = "init_op"} { + tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.Identity"(%arg) : (tensor) -> tensor + tf_executor.fetch %ctl : !tf_executor.control + } + return + } + + func.func @main() attributes {tf.entry_function = {inputs = "", outputs = ""}, tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } + // CHECK: @main(%[[ARG_0:.*]]: tensor {tf_saved_model.bound_input = @__tf_saved_model_asset0_file.txt}) + // CHECK-SAME: tf.entry_function = {inputs = "init_op_0:0", outputs = ""} + // CHECK: %{{.*}}, %[[CTL:.*]] = tf_executor.island wraps "tf.Identity"(%[[ARG_0]]) + // CHECK: tf_executor.fetch %[[CTL]] +} + +// ----- + +// Tests that the input name for the new argument created in @main (for the +// "restore_op" initializer function) is not added when there is no +// tf.entry_function. + +// CHECK-LABEL: module +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op]} : () -> () +// CHECK: "tf_saved_model.session_initializer"() <{initializers = []}> + + func.func @init_func_restore_op(%arg: tensor {tf_saved_model.index_path = ["file_prefix"]}) + attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"], tf_saved_model.initializer_type = "restore_op"} { + tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %out_0, %ctl_0 = tf_executor.island wraps "tf.Const"() {value = dense<"var_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %out_1, %ctl_1 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "var_0", device = "/device:CPU:0"} : () -> tensor>> + %out_2, %ctl_2 = tf_executor.island wraps "tf.RestoreV2"(%arg, %out_0, %out) {} : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<2xf32> + %ctl_3 = tf_executor.island wraps "tf.AssignVariableOp"(%out_1, %out_2) : (tensor>>, tensor<2xf32>) -> () + tf_executor.fetch %ctl_3 : !tf_executor.control + } + return + } + + func.func @main() attributes {tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } +// A new argument corresponding to the "file_prefix" should be created. +// Also checks that tf.entry_function is not created. +// CHECK: func.func @main(%[[ARG:.*]]: tensor {tf_saved_model.index_path = ["file_prefix"]}) attributes {tf_saved_model.exported_names = ["main"]} +} + +// ----- + +// Tests no change when there's no initializer functions. + +// CHECK-LABEL: module attributes +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1228 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = []} : () -> () +// Check that the initializers list is empty. 
+// CHECK: "tf_saved_model.session_initializer"() +// CHECK-SAME: initializers = [] + + func.func @main() attributes {tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } +// CHECK: func.func @main() +// CHECK-NEXT: tf_executor.graph { +// CHECK-NEXT: tf_executor.fetch +// CHECK-NEXT: } +// CHECK-NEXT: return +} + +// ----- + +// Tests no change when there's no "tf_saved_model.session_initializer". +// CHECK-LABEL: module attributes +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1228 : i32}, tf_saved_model.semantics} { + func.func @main() attributes {tf_saved_model.exported_names = ["main"]} { + return + } +// CHECK: func.func @main() +// CHECK-NEXT: return +} + +// ----- + +// Tests when the main function is empty. +// CHECK-LABEL: module attributes +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1228 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@NoOp]} : () -> () +// Check that the initializers attribute is untouched. +// CHECK: "tf_saved_model.session_initializer"() +// CHECK-SAME: initializers = [@NoOp] + + func.func @NoOp() + attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"], tf_saved_model.initializer_type = "init_op"} { + return + } +// The initializer function is untouched when the main function is empty. +// CHECK: func.func @NoOp + + func.func @main() attributes {tf_saved_model.exported_names = ["main"]} { + return + } +// CHECK: func.func @main() +// CHECK-NEXT: return +} + +// ----- + +// Tests when the initializer function is empty. +// CHECK-LABEL: module attributes +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1228 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_when_main_empty]} : () -> () +// Check that the initializers attribute is untouched. +// CHECK: "tf_saved_model.session_initializer"() +// CHECK-SAME: initializers = [@init_func_when_main_empty] + + func.func @init_func_when_main_empty() + attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"], tf_saved_model.initializer_type = "init_op"} { + return + } +// The initializer function is untouched. +// CHECK: func.func @init_func_when_main_empty() + + func.func @main() attributes {tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } +// CHECK: func.func @main() +} + +// ----- + +// @main function must exist in a valid input module for this pass. + +// expected-error @+1 {{Main function op not found.}} +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1228 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@NoOp]} : () -> () + + func.func @NoOp() + attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"], tf_saved_model.initializer_type = "init_op"} { + return + } +} + +// ----- + +// Tests malformed initializer function that has a fetch other than +// tf_executor::ControlType. 
+ +// expected-error @+1 {{Validation on initializer functions failed.}} +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1228 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@NoOp]} : () -> () + + func.func @NoOp() + attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"], tf_saved_model.initializer_type = "init_op"} { + tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.Const"() {device = "", value = dense<[1]> : tensor<1xi64>} : () -> tensor<1xi64> + // expected-error @+1 {{Validation failed for the initializer function: NoOp. All initializer function's fetches should be tf_executor::ControlType. Got: tensor<1xi64>.}} + tf_executor.fetch %out : tensor<1xi64> + } + return + } + + func.func @main() attributes {tf.entry_function = {inputs = "", outputs = ""}, tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } +} + +// ----- + +// Tests that an error is emitted when an initializer function does not have the +// tf_saved_model.initializer_type attribute. + +// expected-error @below {{Validation on initializer functions failed.}} +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1228 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@NoOp]} : () -> () + + // expected-error @below {{Initializer func op does not have tf_saved_model.initializer_type attribute. Func op: NoOp}} + func.func @NoOp() + attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_NoOp"]} { + tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.Const"() {device = "", value = dense<[1]> : tensor<1xi64>} : () -> tensor<1xi64> + tf_executor.fetch %ctl : !tf_executor.control + } + return + } + + func.func @main() attributes {tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_merge_save_function_ops_to_main.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_merge_save_function_ops_to_main.mlir new file mode 100644 index 000000000000..a26810fdb5b2 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_merge_save_function_ops_to_main.mlir @@ -0,0 +1,163 @@ +// RUN: tf-quant-opt %s -tf-quant-merge-save-function-ops-to-main \ +// RUN: -allow-unregistered-dialect -mlir-disable-threading \ +// RUN: -split-input-file -verify-diagnostics | FileCheck %s + +// Test that the @tf_quant_save's ops are cloned to @main. 
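+// The ops of @tf_quant__save are cloned into @main's graph, the file prefix is
+// taken from @main's `__tf_file_prefix` argument, and the save function itself
+// is erased afterwards (see the CHECK-NOT below).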
+ +module attributes {tf_saved_model.semantics} { + func.func private @tf_quant__save(%arg: tensor) -> () { + tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.VarHandleOp"() {shared_name = "var_0"} : () -> tensor>> + %out_0, %ctl_0 = tf_executor.island wraps "tf.ReadVariableOp"(%out) : (tensor>>) -> tensor<2xf32> + %out_1, %ctl_1 = tf_executor.island wraps "tf.Const"() {value = dense<"var_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %out_2, %ctl_2 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %ctl_3 = tf_executor.island wraps "tf.SaveV2"(%arg, %out_1, %out_2, %out_0) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor<2xf32>) -> () + tf_executor.fetch %ctl_3 : !tf_executor.control + } + return + } + + func.func @main(%arg: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]}) -> () + attributes {tf.entry_function = {inputs = "tf_file_prefix:0", outputs = ""}, tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } +} +// Save function should be erased. +// CHECK-NOT: @tf_quant__save + +// Test that the contents of @tf_quant__save are copied to @main. +// CHECK: func.func @main +// CHECK-SAME: %[[ARG_0:.*]]: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]} +// CHECK: tf_executor.graph +// CHECK: %[[VAR_HANDLE:.*]], {{.*}} = tf_executor.island wraps "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> +// CHECK: %[[READ_VARIABLE:.*]], {{.*}} = tf_executor.island wraps "tf.ReadVariableOp"(%[[VAR_HANDLE]]) +// CHECK-DAG: %[[CST_0:.*]], {{.*}} = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<"var_0"> : tensor<1x!tf_type\.string>.*}}}> +// CHECK-DAG: %[[CST_1:.*]], {{.*}} = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<""> : tensor<1x!tf_type\.string>.*}}}> +// CHECK: %[[CTL_0:.*]] = tf_executor.island wraps "tf.SaveV2"(%[[ARG_0]], %[[CST_0]], %[[CST_1]], %[[READ_VARIABLE]]) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor<2xf32>) -> () + +// Test that the Identity op has been created to fetch the file prefix +// argument. It should also have control dependency to the `SaveV2` op. +// CHECK: %[[IDENTITY:.*]], %[[CTL_1:.*]] = tf_executor.island(%[[CTL_0]]) wraps "tf.Identity"(%[[ARG_0]]) +// CHECK: tf_executor.fetch %[[CTL_1]] : !tf_executor.control +// CHECK: return + +// ----- + +// Test that no ops are added to @main when @tf_quant__save function does +// not exist. + +module attributes {tf_saved_model.semantics} { + func.func @main(%arg: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]}) -> () + attributes {tf.entry_function = {inputs = "tf_file_prefix:0", outputs = ""}, tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } +} +// CHECK: func.func @main +// CHECK: tf_executor.graph +// CHECK-NEXT: tf_executor.fetch + +// ----- + +// Test error when @main op doesn't exist. + +// expected-error @+1 {{Main function op not found.}} +module attributes {tf_saved_model.semantics} { +} + +// ----- + +// Test that no ops are added to @main when there are no `GraphOp` in @main. 
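+// Without a `tf_executor.graph` in @main there is nowhere to clone the save
+// ops into, so @main is expected to be left unchanged.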
+ +module attributes {tf_saved_model.semantics} { + func.func @main(%arg: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]}) -> () + attributes {tf.entry_function = {inputs = "tf_file_prefix:0", outputs = ""}, tf_saved_model.exported_names = ["main"]} { + return + } +// CHECK: func.func @main({{.*}}) attributes {{{.*}}} { +// CHECK-NEXT: return + + func.func private @tf_quant__save(%arg: tensor) -> () { + tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.Const"() {value = dense<"hello"> : tensor} : () -> tensor + tf_executor.fetch %ctl : !tf_executor.control + } + return + } +} + +// ----- + +// Test that no ops are added to @main when there are no `GraphOp` in +// @tf_quant__save. + +module attributes {tf_saved_model.semantics} { + func.func @main(%arg: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]}) -> () + attributes {tf.entry_function = {inputs = "tf_file_prefix:0", outputs = ""}, tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } +// CHECK: func.func @main({{.*}}) attributes {{{.*}}} { +// CHECK-NEXT: tf_executor.graph +// CHECK-NEXT: tf_executor.fetch + + func.func private @tf_quant__save(%arg: tensor) -> () { + return + } +} + +// ----- + +// Test that the @tf_quant_save's ops are cloned to @main. When there are no +// __tf_file_prefix argument in @main, confirm that it is created and wired +// to the newly created `IdentityOp`. + +module attributes {tf_saved_model.semantics} { + func.func private @tf_quant__save(%arg: tensor) -> () { + tf_executor.graph { + %out, %ctl = tf_executor.island wraps "tf.VarHandleOp"() {shared_name = "var_0"} : () -> tensor>> + %out_0, %ctl_0 = tf_executor.island wraps "tf.ReadVariableOp"(%out) : (tensor>>) -> tensor<2xf32> + %out_1, %ctl_1 = tf_executor.island wraps "tf.Const"() {value = dense<"var_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %out_2, %ctl_2 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %ctl_3 = tf_executor.island wraps "tf.SaveV2"(%arg, %out_1, %out_2, %out_0) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor<2xf32>) -> () + tf_executor.fetch %ctl_3 : !tf_executor.control + } + return + } + + func.func @main() -> () attributes { + tf.entry_function = {inputs = "", outputs = ""}, tf_saved_model.exported_names = ["main"]} { + tf_executor.graph { + tf_executor.fetch + } + return + } +} +// Save function should be erased. +// CHECK-NOT: @tf_quant__save + +// Test that the contents of @tf_quant__save are copied to @main. +// CHECK: func.func @main +// Test that the "__tf_file_prefix" argument of type `tensor` +// has been created. 
+// CHECK-SAME: %[[ARG_0:.*]]: tensor {tf_saved_model.index_path = ["__tf_file_prefix"]} +// CHECK-SAME: tf.entry_function = {inputs = "__tf_file_prefix:0", outputs = ""} +// CHECK: tf_executor.graph +// CHECK: %[[VAR_HANDLE:.*]], {{.*}} = tf_executor.island wraps "tf.VarHandleOp"() <{{{.*shared_name = "var_0".*}}}> +// CHECK: %[[READ_VARIABLE:.*]], {{.*}} = tf_executor.island wraps "tf.ReadVariableOp"(%[[VAR_HANDLE]]) +// CHECK-DAG: %[[CST_0:.*]], {{.*}} = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<"var_0"> : tensor<1x!tf_type\.string>.*}}}> +// CHECK-DAG: %[[CST_1:.*]], {{.*}} = tf_executor.island wraps "tf.Const"() <{{{.*value = dense<""> : tensor<1x!tf_type\.string>.*}}}> +// CHECK: %[[CTL_0:.*]] = tf_executor.island wraps "tf.SaveV2"(%[[ARG_0]], %[[CST_0]], %[[CST_1]], %[[READ_VARIABLE]]) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor<2xf32>) -> () + +// Test that the Identity op has been created to fetch the file prefix +// argument. It should also have control dependency to the `SaveV2` op. +// CHECK: %[[IDENTITY:.*]], %[[CTL_1:.*]] = tf_executor.island(%[[CTL_0]]) wraps "tf.Identity"(%[[ARG_0]]) +// CHECK: tf_executor.fetch %[[CTL_1]] : !tf_executor.control +// CHECK: return diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_optimize.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_optimize.mlir new file mode 100644 index 000000000000..87a8694203fd --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_optimize.mlir @@ -0,0 +1,124 @@ +// RUN: tf-quant-opt %s -tf-quant-optimize -allow-unregistered-dialect | FileCheck %s + +func.func @remove_redundant_cast(%arg0: tensor<1x100x100x1xf32>) -> (tensor<1x96x96x1xf32>) { + %cst = "tf.Const"() {value = dense<-128> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<0.0235294122> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<0.00708661414> : tensor<1xf32>} : () -> tensor<1xf32> + %cst_2 = "tf.Const"() {value = dense<1.799000e+03> : tensor<1xf32>} : () -> tensor<1xf32> + %cst_3 = "tf.Const"() {value = dense<[[[[1.400000e+01]], [[-2.800000e+01]], [[4.200000e+01]]], [[[-5.600000e+01]], [[7.100000e+01]], [[-8.500000e+01]]], [[[9.900000e+01]], [[-1.130000e+02]], [[1.270000e+02]]]]> : tensor<3x3x1x1xf32>} : () -> tensor<3x3x1x1xf32> + %cst_4 = "tf.Const"() {value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {value = dense<0.00118110236> : tensor<1xf32>} : () -> tensor<1xf32> + %cst_6 = "tf.Const"() {value = dense<1.079500e+04> : tensor<1xf32>} : () -> tensor<1xf32> + %cst_7 = "tf.Const"() {value = dense<0.00392156886> : tensor} : () -> tensor + %cst_8 = "tf.Const"() {value = dense<5.000000e-01> : tensor} : () -> tensor + %cst_9 = "tf.Const"() {value = dense<127> : tensor} : () -> tensor + %0 = "tf.Div"(%arg0, %cst_7) : (tensor<1x100x100x1xf32>, tensor) -> tensor<1x100x100x1xf32> + %1 = "tf.Round"(%0) : (tensor<1x100x100x1xf32>) -> tensor<1x100x100x1xf32> + %2 = "tf.Cast"(%1) : (tensor<1x100x100x1xf32>) -> tensor<1x100x100x1xi32> + %3 = "tf.AddV2"(%2, %cst) : (tensor<1x100x100x1xi32>, tensor) -> tensor<1x100x100x1xi32> + + %4 = "tf.ClipByValue"(%3, %cst, %cst_9) : (tensor<1x100x100x1xi32>, tensor, tensor) -> tensor<1x100x100x1xi32> + %5 = "tf.Cast"(%4) {Truncate = false} : (tensor<1x100x100x1xi32>) -> tensor<1x100x100x1xi8> + %6 = "tf.Cast"(%5) {Truncate = false} : (tensor<1x100x100x1xi8>) -> tensor<1x100x100x1xf32> + + %7 = "tf.Sub"(%6, %cst_4) : (tensor<1x100x100x1xf32>, tensor) -> 
tensor<1x100x100x1xf32> + %8 = "tf.Conv2D"(%7, %cst_3) {dilations = [1, 1, 1, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x100x100x1xf32>, tensor<3x3x1x1xf32>) -> tensor<1x98x98x1xf32> + %9 = "tf.AddV2"(%8, %cst_6) : (tensor<1x98x98x1xf32>, tensor<1xf32>) -> tensor<1x98x98x1xf32> + %10 = "tf.Mul"(%9, %cst_5) : (tensor<1x98x98x1xf32>, tensor<1xf32>) -> tensor<1x98x98x1xf32> + %11 = "tf.AddV2"(%10, %cst_8) : (tensor<1x98x98x1xf32>, tensor) -> tensor<1x98x98x1xf32> + %12 = "tf.Floor"(%11) : (tensor<1x98x98x1xf32>) -> tensor<1x98x98x1xf32> + %13 = "tf.Cast"(%12) {Truncate = false} : (tensor<1x98x98x1xf32>) -> tensor<1x98x98x1xi32> + %14 = "tf.AddV2"(%13, %cst) : (tensor<1x98x98x1xi32>, tensor) -> tensor<1x98x98x1xi32> + + %15 = "tf.ClipByValue"(%14, %cst, %cst_9) : (tensor<1x98x98x1xi32>, tensor, tensor) -> tensor<1x98x98x1xi32> + %16 = "tf.Cast"(%15) {Truncate = false} : (tensor<1x98x98x1xi32>) -> tensor<1x98x98x1xi8> + %17 = "tf.Cast"(%16) {Truncate = false} : (tensor<1x98x98x1xi8>) -> tensor<1x98x98x1xf32> + + %18 = "tf.Sub"(%17, %cst_4) : (tensor<1x98x98x1xf32>, tensor) -> tensor<1x98x98x1xf32> + %19 = "tf.Conv2D"(%18, %cst_3) {dilations = [1, 1, 1, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x98x98x1xf32>, tensor<3x3x1x1xf32>) -> tensor<1x96x96x1xf32> + %20 = "tf.AddV2"(%19, %cst_2) : (tensor<1x96x96x1xf32>, tensor<1xf32>) -> tensor<1x96x96x1xf32> + %21 = "tf.Mul"(%20, %cst_1) : (tensor<1x96x96x1xf32>, tensor<1xf32>) -> tensor<1x96x96x1xf32> + %22 = "tf.AddV2"(%21, %cst_8) : (tensor<1x96x96x1xf32>, tensor) -> tensor<1x96x96x1xf32> + %23 = "tf.Floor"(%22) : (tensor<1x96x96x1xf32>) -> tensor<1x96x96x1xf32> + %24 = "tf.Cast"(%23) {Truncate = false} : (tensor<1x96x96x1xf32>) -> tensor<1x96x96x1xi32> + %25 = "tf.AddV2"(%24, %cst) : (tensor<1x96x96x1xi32>, tensor) -> tensor<1x96x96x1xi32> + + %26 = "tf.ClipByValue"(%25, %cst, %cst_9) : (tensor<1x96x96x1xi32>, tensor, tensor) -> tensor<1x96x96x1xi32> + %27 = "tf.Cast"(%26) {Truncate = false} : (tensor<1x96x96x1xi32>) -> tensor<1x96x96x1xi8> + %28 = "tf.Cast"(%27) : (tensor<1x96x96x1xi8>) -> tensor<1x96x96x1xi32> + + %29 = "tf.Sub"(%28, %cst) : (tensor<1x96x96x1xi32>, tensor) -> tensor<1x96x96x1xi32> + %30 = "tf.Cast"(%29) : (tensor<1x96x96x1xi32>) -> tensor<1x96x96x1xf32> + %31 = "tf.Mul"(%30, %cst_0) : (tensor<1x96x96x1xf32>, tensor) -> tensor<1x96x96x1xf32> + return %31 : tensor<1x96x96x1xf32> + +// CHECK-LABEL: func.func @remove_redundant_cast + +// CHECK: %[[CLIPBYVALUE_0:.*]] = "tf.ClipByValue" +// CHECK-SAME: (tensor<1x100x100x1xi32>, tensor, tensor) -> tensor<1x100x100x1xi32> +// CHECK: %[[CAST_1:.*]] = "tf.Cast"(%[[CLIPBYVALUE_0]]) <{Truncate = false}> : (tensor<1x100x100x1xi32>) -> tensor<1x100x100x1xf32> + +// CHECK: %[[CLIPBYVALUE_1:.*]] = "tf.ClipByValue" +// CHECK-SAME: (tensor<1x98x98x1xi32>, tensor, tensor) -> tensor<1x98x98x1xi32> +// CHECK: %[[CAST_3:.*]] = "tf.Cast"(%[[CLIPBYVALUE_1]]) <{Truncate = false}> : (tensor<1x98x98x1xi32>) -> tensor<1x98x98x1xf32> + +// CHECK: %[[CLIPBYVALUE_2:.*]] = "tf.ClipByValue" +// CHECK-SAME: (tensor<1x96x96x1xi32>, tensor, tensor) -> tensor<1x96x96x1xi32> +// CHECK: %[[SUB_2:.*]] = "tf.Sub"(%[[CLIPBYVALUE_2]], {{.*}}) : (tensor<1x96x96x1xi32>, tensor) -> tensor<1x96x96x1xi32> +} + +func.func @consecutive_add_add(%arg0: tensor) -> (tensor) { + %cst = "tf.Const"() {value = dense<-18> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<-12> : tensor} : () -> tensor + %0 = "tf.AddV2"(%arg0, %cst) {T = i32, device = ""} : (tensor, tensor) -> tensor + %1 = 
"tf.AddV2"(%0, %cst_1) {T = i32, device = ""} : (tensor, tensor) -> tensor + return %1 : tensor + +// CHECK-LABEL: func.func @consecutive_add_add + +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<-30> : tensor}> : () -> tensor +// CHECK: %[[ADD:.*]] = "tf.AddV2"(%arg0, %[[CST]]) : (tensor, tensor) -> tensor +// CHECK: return %[[ADD]] : tensor +} + +func.func @consecutive_add_sub(%arg0: tensor) -> (tensor) { + %cst = "tf.Const"() {value = dense<-18> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<-12> : tensor} : () -> tensor + %0 = "tf.AddV2"(%arg0, %cst) {T = i32, device = ""} : (tensor, tensor) -> tensor + %1 = "tf.Sub"(%0, %cst_1) {T = i32, device = ""} : (tensor, tensor) -> tensor + return %1 : tensor + +// CHECK-LABEL: func.func @consecutive_add_sub + +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<6> : tensor}> : () -> tensor +// CHECK: %[[SUB:.*]] = "tf.Sub"(%arg0, %[[CST]]) : (tensor, tensor) -> tensor +// CHECK: return %[[SUB]] : tensor +} + +func.func @consecutive_sub_add(%arg0: tensor) -> (tensor) { + %cst = "tf.Const"() {value = dense<-18> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<-12> : tensor} : () -> tensor + %0 = "tf.Sub"(%arg0, %cst) {T = i32, device = ""} : (tensor, tensor) -> tensor + %1 = "tf.AddV2"(%0, %cst_1) {T = i32, device = ""} : (tensor, tensor) -> tensor + return %1 : tensor + +// CHECK-LABEL: func.func @consecutive_sub_add + +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<6> : tensor}> : () -> tensor +// CHECK: %[[ADD:.*]] = "tf.AddV2"(%arg0, %[[CST]]) : (tensor, tensor) -> tensor +// CHECK: return %[[ADD]] : tensor +} + +func.func @consecutive_sub_sub(%arg0: tensor) -> (tensor) { + %cst = "tf.Const"() {value = dense<-18> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<-12> : tensor} : () -> tensor + %0 = "tf.Sub"(%arg0, %cst) {T = i32, device = ""} : (tensor, tensor) -> tensor + %1 = "tf.Sub"(%0, %cst_1) {T = i32, device = ""} : (tensor, tensor) -> tensor + return %1 : tensor + +// CHECK-LABEL: func.func @consecutive_sub_sub + +// CHECK: %[[CST:.*]] = "tf.Const"() <{value = dense<-30> : tensor}> : () -> tensor +// CHECK: %[[SUB:.*]] = "tf.Sub"(%arg0, %[[CST]]) : (tensor, tensor) -> tensor +// CHECK: return %[[SUB]] : tensor +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_prepare_lifting.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_prepare_lifting.mlir new file mode 100644 index 000000000000..b8384cbc4c21 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_prepare_lifting.mlir @@ -0,0 +1,401 @@ +// RUN: tf-quant-opt %s -tf-quant-prepare-lifting -split-input-file | FileCheck %s + +func.func @decompose_batch_norm(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_0 = "tf.Const"() {value = dense<0.500000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %add, %batch_mean, %batch_variance, %reserve_space_1, %reserve_space_2, %reserve_space_3 = "tf.FusedBatchNormV3"(%arg0, %cst, %cst_0, %cst_0, %cst) {data_format = "NHWC", device = "", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = false} : (tensor<*xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) + func.return %add : tensor<*xf32> +} +// CHECK: func @decompose_batch_norm +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = 
dense<2.49743462E-5> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<0.999950051> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK: %[[mul:.*]] = "tf.Mul"(%arg0, %[[CONST_0]]) : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> +// CHECK: %[[add:.*]] = "tf.AddV2"(%[[mul]], %[[CONST]]) : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> +// CHECK-NEXT: return %[[add]] : tensor<*xf32> + +// ----- + +func.func @not_decompose_batch_norm(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_0 = "tf.Const"() {value = dense<0.500000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %bn, %batch_mean, %batch_variance, %reserve_space_1, %reserve_space_2, %reserve_space_3 = "tf.FusedBatchNormV3"(%arg0, %cst, %cst_0, %cst_0, %cst) {data_format = "NHWC", device = "", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = true} : (tensor<*xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) + func.return %bn : tensor<*xf32> +} +// CHECK: func @not_decompose_batch_norm +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<5.000000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK: %[[bn:.*]], %batch_mean, %batch_variance, %reserve_space_1, %reserve_space_2, %reserve_space_3 = "tf.FusedBatchNormV3"(%arg0, %[[CONST]], %[[CONST_0]], %[[CONST_0]], %[[CONST]]) <{data_format = "NHWC", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = true}> {device = ""} : (tensor<*xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) +// CHECK-NEXT: return %[[bn]] : tensor<*xf32> + +// ----- + +func.func @convert_add_to_biasadd(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<0.500000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %cst) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %1 = "tf.AddV2"(%0, %cst_0) : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> + func.return %1 : tensor<1x3x2x2xf32> +} +// CHECK: func @convert_add_to_biasadd +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<5.000000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: return %[[BIASADD]] : tensor<1x3x2x2xf32> + +// ----- + +func.func @not_convert_add_to_biasadd(%arg0: tensor<1x3x4x3xf32>) -> 
(tensor<1x3x2x3xf32>) { + %cst = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x3xf32>} : () -> tensor<2x3x3x3xf32> + %cst_0 = "tf.Const"() {value = dense<0.500000e+00> : tensor<1x3x2x3xf32>} : () -> tensor<1x3x2x3xf32> + %0 = "tf.Conv2D"(%arg0, %cst) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x3xf32>) -> tensor<1x3x2x3xf32> + %1 = "tf.AddV2"(%0, %cst_0) : (tensor<1x3x2x3xf32>, tensor<1x3x2x3xf32>) -> tensor<1x3x2x3xf32> + func.return %1 : tensor<1x3x2x3xf32> +} +// CHECK: func @not_convert_add_to_biasadd +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2x3x3x3xf32>}> : () -> tensor<2x3x3x3xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<5.000000e-01> : tensor<1x3x2x3xf32>}> : () -> tensor<1x3x2x3xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x3xf32>) -> tensor<1x3x2x3xf32> +// CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[CONV2D]], %[[CONST_0]]) : (tensor<1x3x2x3xf32>, tensor<1x3x2x3xf32>) -> tensor<1x3x2x3xf32> +// CHECK-NEXT: return %[[ADD]] : tensor<1x3x2x3xf32> + +// ----- + +func.func @fuse_conv2d_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %cst) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %1 = "tf.Mul"(%0, %cst_0) : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> + func.return %1 : tensor<1x3x2x2xf32> +} +// CHECK: func @fuse_conv2d_and_mul +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<8.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: return %[[CONV2D]] : tensor<1x3x2x2xf32> + +// ----- + +func.func @not_fuse_conv2d_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<2x2xf32>} : () -> tensor<2x2xf32> + %0 = "tf.Conv2D"(%arg0, %cst) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %1 = "tf.Mul"(%0, %cst_0) : (tensor<1x3x2x2xf32>, tensor<2x2xf32>) -> tensor<1x3x2x2xf32> + func.return %1 : tensor<1x3x2x2xf32> +} +// CHECK: func @not_fuse_conv2d_and_mul +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<4.000000e-01> : tensor<2x2xf32>}> : () -> tensor<2x2xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, 
%[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: %[[ADD:.*]] = "tf.Mul"(%[[CONV2D]], %[[CONST_0]]) : (tensor<1x3x2x2xf32>, tensor<2x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: return %[[ADD]] : tensor<1x3x2x2xf32> + +// ----- + +func.func @fuse_conv2d_with_bias_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_1 = "tf.Const"() {value = dense<0.500000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %cst) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> + %2 = "tf.Mul"(%1, %cst_1) : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> + func.return %2 : tensor<1x3x2x2xf32> +} +// CHECK: func @fuse_conv2d_with_bias_and_mul +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<2.000000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: return %[[BIASADD]] : tensor<1x3x2x2xf32> + +// ----- + +func.func @not_fuse_conv2d_with_bias_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>, tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_1 = "tf.Const"() {value = dense<0.800000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %cst) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> + %2 = "tf.Mul"(%0, %cst_1) : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> + func.return %1, %2 : tensor<1x3x2x2xf32>, tensor<1x3x2x2xf32> +} +// CHECK: func @not_fuse_conv2d_with_bias_and_mul +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<4.000000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<8.000000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = 
[], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: %[[MUL:.*]] = "tf.Mul"(%[[CONV2D]], %[[CONST_1]]) : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: return %[[BIASADD]], %[[MUL]] : tensor<1x3x2x2xf32>, tensor<1x3x2x2xf32> + +// ----- + +func.func @fuse_conv2d_with_bias_and_add(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<0.500000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_1 = "tf.Const"() {value = dense<0.500000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %cst) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> + %2 = "tf.AddV2"(%1, %cst_1) : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> + func.return %2 : tensor<1x3x2x2xf32> +} +// CHECK: func @fuse_conv2d_with_bias_and_add +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: return %[[BIASADD]] : tensor<1x3x2x2xf32> + +// ----- + +func.func @not_fuse_conv2d_with_bias_and_add(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %cst) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> + %2 = "tf.AddV2"(%1, %arg1) : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> + func.return %2 : tensor<1x3x2x2xf32> +} +// CHECK: func @not_fuse_conv2d_with_bias_and_add +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<4.000000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) 
-> tensor<1x3x2x2xf32> +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[BIASADD]], %arg1) : (tensor<1x3x2x2xf32>, tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK-NEXT: return %[[ADD]] : tensor<1x3x2x2xf32> + +// ----- + +func.func @match_depthwise_conv2d_and_add(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %0 = "tf.DepthwiseConv2dNative"(%arg0, %cst) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor + %1 = "tf.AddV2"(%0, %cst_0) : (tensor, tensor<3xf32>) -> tensor<*xf32> + func.return %1 : tensor<*xf32> +} +// CHECK: func @match_depthwise_conv2d_and_add +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x3x3x1xf32>}> : () -> tensor<2x3x3x1xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<4.000000e-01> : tensor<3xf32>}> : () -> tensor<3xf32> +// CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> {device = ""} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor, tensor<3xf32>) -> tensor<*xf32> +// CHECK-NEXT: return %[[BIASADD]] : tensor<*xf32> + +// ----- + +func.func @match_depthwise_conv2d_and_mul(%arg0: tensor<*xf32>) -> (tensor) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %0 = "tf.DepthwiseConv2dNative"(%arg0, %cst) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor + %1 = "tf.Mul"(%0, %cst_0) : (tensor, tensor<3xf32>) -> tensor + func.return %1 : tensor +} +// CHECK: func @match_depthwise_conv2d_and_mul +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<8.000000e-01> : tensor<2x3x3x1xf32>}> : () -> tensor<2x3x3x1xf32> +// CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> {device = ""} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor +// CHECK-NEXT: return %[[DEPTHWISE_CONV2D]] : tensor + +// ----- + +func.func @match_depthwise_conv2d_with_bias_and_add(%arg0: tensor<*xf32>) -> (tensor) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %0 = "tf.DepthwiseConv2dNative"(%arg0, %cst) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC"} : (tensor, tensor<3xf32>) -> 
tensor + %2 = "tf.AddV2"(%1, %cst_1) : (tensor, tensor<3xf32>) -> tensor + func.return %2 : tensor +} +// CHECK: func @match_depthwise_conv2d_with_bias_and_add +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x3x3x1xf32>}> : () -> tensor<2x3x3x1xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<8.000000e-01> : tensor<3xf32>}> : () -> tensor<3xf32> +// CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> {device = ""} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor, tensor<3xf32>) -> tensor +// CHECK-NEXT: return %[[BIASADD]] : tensor + +// ----- + +func.func @match_depthwise_conv2d_with_bias_and_mul(%arg0: tensor<*xf32>) -> (tensor) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() {value = dense<0.500000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %0 = "tf.DepthwiseConv2dNative"(%arg0, %cst) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC"} : (tensor, tensor<3xf32>) -> tensor + %2 = "tf.Mul"(%1, %cst_1) : (tensor, tensor<3xf32>) -> tensor + func.return %2 : tensor +} +// CHECK: func @match_depthwise_conv2d_with_bias_and_mul +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2x3x3x1xf32>}> : () -> tensor<2x3x3x1xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<2.000000e-01> : tensor<3xf32>}> : () -> tensor<3xf32> +// CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> {device = ""} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) <{data_format = "NHWC"}> : (tensor, tensor<3xf32>) -> tensor +// CHECK-NEXT: return %[[BIASADD]] : tensor + +// ----- + +func.func @lower_einsum(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) -> tensor<3x4x6xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ijk,ikm->ijm"}: (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> + func.return %0 : tensor<3x4x6xf32> +} +// CHECK-LABEL: lower_einsum +// CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> + +// ----- + +func.func @removing_identity_after_const(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() {value = dense<0.500000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %identity = "tf.Identity"(%cst) : (tensor<2x3x3x1xf32>) -> tensor<2x3x3x1xf32> + %0 = "tf.DepthwiseConv2dNative"(%arg0, %identity) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<*xf32>, 
tensor<2x3x3x1xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC"} : (tensor<*xf32>, tensor<3xf32>) -> tensor<*xf32> + %2 = "tf.Mul"(%1, %cst_1) : (tensor<*xf32>, tensor<3xf32>) -> tensor<*xf32> + func.return %2 : tensor<*xf32> +} +// CHECK: func @removing_identity_after_const +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<2x3x3x1xf32>}> : () -> tensor<2x3x3x1xf32> +// CHECK: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) + +// ----- + +func.func @not_removing_identity_of_returning_value(%arg0: tensor<*xf32>) -> (tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x1xf32>} : () -> tensor<2x3x3x1xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() {value = dense<0.500000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %0 = "tf.DepthwiseConv2dNative"(%arg0, %cst) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<*xf32>, tensor<2x3x3x1xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC"} : (tensor<*xf32>, tensor<3xf32>) -> tensor<*xf32> + %2 = "tf.Mul"(%1, %cst_1) : (tensor<*xf32>, tensor<3xf32>) -> tensor<*xf32> + %3 = "tf.Identity"(%2) : (tensor<*xf32>) -> tensor<*xf32> + func.return %3 : tensor<*xf32> +} +// CHECK: func @not_removing_identity_of_returning_value +// CHECK: %[[identity:.*]] = "tf.Identity" +// CHECK: return %[[identity]] : tensor<*xf32> + +// ----- + +func.func @batch_norm_with_q_dq(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_0 = "tf.Const"() {device = "", value = dense<5.000000e-01> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_1 = "tf.Const"() {device = "", value = dense<5.000000e-01> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "quantization.qcast"(%cst_1) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32:3, {0.003937007874015748,0.003937007874015748}>> + %1 = "quantization.dcast"(%0) : (tensor<2x3x3x2x!quant.uniform:f32:3, {0.003937007874015748,0.003937007874015748}>>) -> tensor<2x3x3x2xf32> + %2 = "quantization.qcast"(%arg0) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> + %3 = "quantization.dcast"(%2) : (tensor<1x3x4x3x!quant.uniform>) -> tensor<1x3x4x3xf32> + %4 = "tf.Conv2D"(%3, %1) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %y, %batch_mean, %batch_variance, %reserve_space_1, %reserve_space_2, %reserve_space_3 = "tf.FusedBatchNormV3"(%4, %cst, %cst_0, %cst, %cst_0) {data_format = "NHWC", device = "", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = false} : (tensor<1x3x2x2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> (tensor<1x3x2x2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<*xf32>) + %5 = "tf.Relu6"(%y) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + %6 = "quantization.qcast"(%5) : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2x!quant.uniform:f32:3, {0.0026771653824903836:-60,0.0032283464285332388:-28}>> + %7 = "quantization.dcast"(%6) : (tensor<1x3x2x2x!quant.uniform:f32:3, {0.0026771653824903836:-60,0.0032283464285332388:-28}>>) -> 
tensor<1x3x2x2xf32> + %8 = "tf.Identity"(%7) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + %9 = "tf.Identity"(%8) {device = ""} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + return %9 : tensor<1x3x2x2xf32> +} + +// CHECK: func @batch_norm_with_q_dq +// CHECK-DAG: %[[cst:.*]] = "tf.Const"() <{value = dense<0.707036077> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-DAG: %[[cst_0:.*]] = "tf.Const"() <{value = dense<-0.914072155> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK: %[[q_input:.*]] = "quantization.qcast"(%arg0) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK: %[[dq_input:.*]] = "quantization.dcast"(%[[q_input]]) : (tensor<1x3x4x3x!quant.uniform>) -> tensor<1x3x4x3xf32> +// CHECK: %[[q_weight:.*]] = "quantization.qcast"(%[[cst]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32:3, {0.005567213212411235,0.005567213212411235}>> +// CHECK: %[[dq_weight:.*]] = "quantization.dcast"(%[[q_weight]]) : (tensor<2x3x3x2x!quant.uniform:f32:3, {0.005567213212411235,0.005567213212411235}>>) -> tensor<2x3x3x2xf32> +// CHECK: %[[conv:.*]] = "tf.Conv2D"(%[[dq_input]], %[[dq_weight]]) +// CHECK: %[[bias:.*]] = "tf.BiasAdd"(%[[conv]], %[[cst_0]]) <{data_format = "NHWC"}> +// CHECK: %[[relu6:.*]] = "tf.Relu6"(%[[bias]]) + +// ----- + +func.func @remove_check_numerics_op(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.CheckNumerics"(%arg0) {device = "", message = "transformer"} : (tensor<*xf32>) -> tensor<*xf32> + func.return %0 : tensor<*xf32> +} + +// CHECK: func @remove_check_numerics_op +// CHECK: return %arg0 : tensor<*xf32> + +// ----- + +func.func @remove_stop_gradient_op(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.StopGradient"(%arg0) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + func.return %0 : tensor<*xf32> +} + +// CHECK: func @remove_stop_gradient_op +// CHECK: return %arg0 : tensor<*xf32> + +// ----- + +func.func @conv2d_with_large_weight_and_mul(%arg0: tensor) -> (tensor) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<48x48x3x1xf32>} : () -> tensor<48x48x3x1xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<256xf32>} : () -> tensor<256xf32> + %cst_1 = "tf.Const"() {value = dense<0.500000e+00> : tensor<256xf32>} : () -> tensor<256xf32> + %w = "tf.AddV2"(%cst, %cst_1) : (tensor<48x48x3x1xf32>, tensor<256xf32>) -> tensor<48x48x3x256xf32> + %0 = "tf.Conv2D"(%arg0, %w) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor, tensor<48x48x3x256xf32>) -> tensor + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC"} : (tensor, tensor<256xf32>) -> tensor + %2 = "tf.Mul"(%1, %cst_1) : (tensor, tensor<256xf32>) -> tensor + func.return %2 : tensor +} +// CHECK: func @conv2d_with_large_weight_and_mul +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<1.250000e+00> : tensor<48x48x3x256xf32>}> : () -> tensor<48x48x3x256xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<2.000000e-01> : tensor<256xf32>}> : () -> tensor<256xf32> +// CHECK-NEXT: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[CONST]]) +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[CONST_0]]) +// CHECK-NEXT: return %[[BIASADD]] + +// ----- + +func.func @depthwise_conv2d_with_large_weight_and_add(%arg0: tensor<*xf32>) -> (tensor) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<48x48x3x1xf32>} : () -> tensor<48x48x3x1xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : 
tensor<3xf32>} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() {value = dense<0.400000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %cst_2 = "tf.Const"() {value = dense<0.500000e+00> : tensor<256xf32>} : () -> tensor<256xf32> + %w = "tf.AddV2"(%cst, %cst_2) : (tensor<48x48x3x1xf32>, tensor<256xf32>) -> tensor<48x48x3x256xf32> + %0 = "tf.DepthwiseConv2dNative"(%arg0, %w) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<*xf32>, tensor<48x48x3x256xf32>) -> tensor + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC"} : (tensor, tensor<3xf32>) -> tensor + %2 = "tf.AddV2"(%1, %cst_1) : (tensor, tensor<3xf32>) -> tensor + func.return %2 : tensor +} +// CHECK: func @depthwise_conv2d_with_large_weight_and_add +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.500000e+00> : tensor<48x48x3x256xf32>}> : () -> tensor<48x48x3x256xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<8.000000e-01> : tensor<3xf32>}> : () -> tensor<3xf32> +// CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) +// CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) +// CHECK-NEXT: return %[[BIASADD]] + +// ----- + +func.func @fuse_conv2d_with_sub_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %cst_1 = "tf.Const"() {value = dense<0.200000e+00> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %0 = "tf.Conv2D"(%arg0, %cst) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %1 = "tf.Sub"(%0, %cst_0) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x2x2xf32> + %2 = "tf.Mul"(%1, %cst_1) : (tensor<1x3x2x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x2x2xf32> + func.return %2 : tensor<1x3x2x2xf32> +} + +// CHECK: func @fuse_conv2d_with_sub_and_mul +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<-0.0800000056> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<4.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-NEXT: %[[CONV:.*]] = "tf.Conv2D"(%arg0, %[[CONST_0]]) +// CHECK-NEXT: %[[BIAS_ADD:.*]] = "tf.BiasAdd"(%[[CONV]], %[[CONST]]) +// CHECK-NEXT: return %[[BIAS_ADD]] + +// ----- + +func.func @fuse_conv2d_with_sub_mul_addv2(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %cst_1 = "tf.Const"() {value = dense<0.200000e+00> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %cst_2 = "tf.Const"() {value = dense<0.300000e+00> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %0 = "tf.Conv2D"(%arg0, %cst) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %1 = "tf.Sub"(%0, %cst_0) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x2x2xf32> + %2 = "tf.Mul"(%1, %cst_1) : 
(tensor<1x3x2x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x2x2xf32> + %3 = "tf.AddV2"(%2, %cst_2) : (tensor<1x3x2x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x2x2xf32> + func.return %3 : tensor<1x3x2x2xf32> +} + +// CHECK: func @fuse_conv2d_with_sub_mul_addv2 +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.200000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<4.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-NEXT: %[[CONV:.*]] = "tf.Conv2D"(%arg0, %[[CONST_0]]) +// CHECK-NEXT: %[[BIAS_ADD:.*]] = "tf.BiasAdd"(%[[CONV]], %[[CONST]]) +// CHECK-NEXT: return %[[BIAS_ADD]] diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_prepare_quantize.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_prepare_quantize.mlir new file mode 100644 index 000000000000..1ace3d3a17dc --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_prepare_quantize.mlir @@ -0,0 +1,42 @@ +// RUN: tf-quant-opt %s -split-input-file -tf-quant-prepare-quantize | FileCheck %s + +module { + func.func @same_scale_test(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %cst = arith.constant dense<[-1, 144]> : tensor<2xi32> + %cst_1 = arith.constant dense<1.0> : tensor<144x10xf32> + %cst_2 = arith.constant dense<0.1> : tensor<10xf32> + %0 = "quantization.qcast"(%arg0) : (tensor<*xf32>) -> tensor<*x!quant.uniform> + %1 = "quantization.dcast"(%0) : (tensor<*x!quant.uniform>) -> tensor<*xf32> + %2 = "tf.MaxPool"(%1) { + data_format = "NHWC", device = "", explicit_paddings = [], + ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 2, 2, 1] + } : (tensor<*xf32>) -> tensor<*xf32> + %3 = "tf.Reshape"(%2, %cst) {device = ""} : (tensor<*xf32>, tensor<2xi32>) -> tensor<*xf32> + %4 = "tf.PartitionedCall"(%3, %cst_1, %cst_2) { + _tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", + executor_type = "", f = @composite_matmul_with_bias_fn_1 + } : (tensor<*xf32>, tensor<144x10xf32>, tensor<10xf32>) -> tensor<*xf32> + %5 = "quantization.qcast"(%4) : (tensor<*xf32>) -> tensor<*x!quant.uniform> + %6 = "quantization.dcast"(%5) : (tensor<*x!quant.uniform>) -> tensor<*xf32> + func.return %6 : tensor<*xf32> + } + + func.func private @composite_matmul_with_bias_fn_1(%a: tensor<*xf32>, %b: tensor<*xf32>, %c: tensor<*xf32>) -> tensor<*xf32> { + func.return %a: tensor<*xf32> + } + +// CHECK-LABEL: same_scale_test +// CHECK: %[[maxpool:.*]] = "tf.MaxPool" +// CHECK: %[[q1:.*]] = "quantization.qcast"(%[[maxpool]]) +// CHECK-SAME: quant.uniform +// CHECK: %[[dq1:.*]] = "quantization.dcast"(%[[q1]]) +// CHECK-SAME: quant.uniform +// CHECK: %[[reshape:.*]] = "tf.Reshape"(%[[dq1]] +// CHECK: %[[q2:.*]] = "quantization.qcast"(%[[reshape]]) +// CHECK-SAME: quant.uniform +// CHECK: %[[dq2:.*]] = "quantization.dcast"(%[[q2]]) +// CHECK-SAME: quant.uniform +// CHECK: "tf.PartitionedCall"(%[[dq2]] +// CHECK-SAME: f = @composite_matmul_with_bias_fn_1 +} + diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_prepare_quantize_drq.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_prepare_quantize_drq.mlir new file mode 100644 index 000000000000..201054dce765 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_prepare_quantize_drq.mlir @@ -0,0 +1,90 @@ +// RUN: tf-quant-opt %s -split-input-file -quant-preprocess-op -tf-quant-prepare-quantize-drq | FileCheck %s + +module { + func.func @matmul(%arg0: tensor<1x2x2x3xf32>) -> (tensor<*xf32>) { + %cst_0 = 
"tf.Const"() {value = dense<0.000000e+00> : tensor<2x1024xf32>} : () -> tensor<2x1024xf32> + %1 = "tf.PartitionedCall"(%arg0, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + func.return %1: tensor<*xf32> + } + func.func private @composite_matmul_fn(%arg0: tensor<1x2x2x3xf32>, %arg1: tensor<2x1024xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> + } + +// CHECK-LABEL: func @matmul +// CHECK-DAG: %[[CONST:.*]] = arith.constant dense<0.000000e+00> : tensor<2x1024xf32> +// CHECK: %0 = "quantization.qcast"(%[[CONST]]) : (tensor<2x1024xf32>) -> tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>> +// CHECK: %1 = "quantization.dcast"(%0) : (tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>>) -> tensor<2x1024xf32> +// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: return %2 : tensor<*xf32> + +// CHECK-LABEL: func private @composite_matmul_fn +// CHECK: %0 = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: return %0 : tensor<*xf32> +} + +// ----- + +module { + func.func @conv2d(%arg0: tensor<1x3x4x3xf32>) -> (tensor<*xf32>) { + %cst_0 = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_1 = "tf.Const"() {value = dense<3.000000e+00> : tensor<2x3x3x512xf32>} : () -> tensor<2x3x3x512xf32> + %1 = "tf.PartitionedCall"(%arg0, %cst_1) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_fn_1} : (tensor<1x3x4x3xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> + %2 = "tf.BiasAdd"(%1, %cst_0) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + func.return %2: tensor<*xf32> + } + func.func private @composite_conv2d_fn_1(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x512xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> + } + +// CHECK-LABEL: func @conv2d +// CHECK-DAG: %[[CONST_0:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> +// CHECK-DAG: %[[CONST_1:.*]] = arith.constant dense<3.000000e+00> : tensor<2x3x3x512xf32> +// CHECK: %0 = "quantization.qcast"(%[[CONST_1]]) : (tensor<2x3x3x512xf32>) -> tensor<2x3x3x512x!quant.uniform:f32, 0.023622047244094488>> +// CHECK: %1 = "quantization.dcast"(%0) : (tensor<2x3x3x512x!quant.uniform:f32, 0.023622047244094488>>) -> tensor<2x3x3x512xf32> +// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x3xf32>, 
tensor<2x3x3x512xf32>) -> tensor<*xf32> +// CHECK: %3 = "tf.BiasAdd"(%2, %[[CONST_0]]) +// CHECK: return %3 : tensor<*xf32> + +// CHECK-LABEL: func private @composite_conv2d_fn_1 +// CHECK: %0 = "tf.Conv2D"(%arg0, %arg1) +// CHECK: return %0 : tensor<*xf32> +} + +// ----- + +module { + func.func @depthwise_conv(%arg0: tensor<1x3x4x512xf32>) -> (tensor<*xf32>) { + %cst_0 = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_1 = "tf.Const"() {value = dense<3.000000e+00> : tensor<2x3x3x512xf32>} : () -> tensor<2x3x3x512xf32> + %0 = "tf.PartitionedCall"(%arg0, %cst_1) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn} : (tensor<1x3x4x512xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + func.return %1: tensor<*xf32> + } + func.func private @composite_depthwise_conv2d_fn(%arg0: tensor<1x3x4x512xf32>, %arg1: tensor<2x3x3x512xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) { + attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1] + } : (tensor<1x3x4x512xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> + } + +// CHECK-LABEL: func @depthwise_conv +// CHECK-DAG: %[[CONST_0:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> +// CHECK-DAG: %[[CONST_1:.*]] = arith.constant dense<3.000000e+00> : tensor<2x3x1x1536xf32> +// CHECK: %0 = "quantization.qcast"(%[[CONST_1]]) : (tensor<2x3x1x1536xf32>) -> tensor<2x3x1x1536x!quant.uniform:f32, 0.023622047244094488>> +// CHECK: %1 = "quantization.dcast"(%0) : (tensor<2x3x1x1536x!quant.uniform:f32, 0.023622047244094488>>) -> tensor<2x3x1x1536xf32> +// CHECK: %2 = "tf.PartitionedCall"(%arg0, %1) <{config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn_0}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x512xf32>, tensor<2x3x1x1536xf32>) -> tensor<*xf32> +// CHECK: %3 = "tf.BiasAdd"(%2, %[[CONST_0]]) +// CHECK: return %3 : tensor<*xf32> + +// CHECK-LABEL: func private @composite_depthwise_conv2d_fn( +// CHECK-SAME: %arg0: tensor<1x3x4x512xf32>, +// CHECK-SAME: %arg1: tensor<2x3x3x512xf32>) + +// CHECK-LABEL: func private @composite_depthwise_conv2d_fn_0( +// CHECK-SAME: %arg0: tensor<1x3x4x512xf32>, +// CHECK-SAME: %arg1: tensor<2x3x1x1536xf32>) +// CHECK: %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]}> {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", device = ""} +// CHECK: return %0 : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_preprocess_op.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_preprocess_op.mlir new file mode 100644 index 000000000000..aeb1bc951a39 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_preprocess_op.mlir @@ -0,0 +1,39 @@ +// RUN: tf-quant-opt %s -split-input-file -tf-quant-preprocess-op | FileCheck %s + +module { + // For UniformQuantized depthwise convolution, tensor shape should have + // transformed from [H,W,C,M] to [H,W,1,CxM], + func.func @depthwise_conv(%arg0: tensor<1x3x4x3xf32>) -> (tensor<*xf32>) { + %cst_0 = 
"tf.Const"() {value = dense<0.000000e+00> : tensor<6xf32>} : () -> tensor<6xf32> + %cst_1 = "tf.Const"() {value = dense<[[[[3.0, 2.0], [1.0, 0.0],[3.0, 2.0]],[[3.0, 2.0], [1.0, 0.0],[3.0, 2.0]],[[3.0, 2.0], [1.0, 0.0],[3.0, 2.0]]],[[[3.0, 2.0], [1.0, 0.0],[3.0, 2.0]],[[3.0, 2.0], [1.0, 0.0],[3.0, 2.0]],[[3.0, 2.0], [1.0, 0.0],[3.0, 2.0]]]]> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "tf.PartitionedCall"(%arg0, %cst_1) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<6xf32>) -> tensor<*xf32> + func.return %1: tensor<*xf32> + } + func.func private @composite_depthwise_conv2d_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) { + attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1] + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> + } + +// CHECK-LABEL: func @depthwise_conv +// CHECK-DAG: %[[CONST_0:.*]] = arith.constant dense<0.000000e+00> : tensor<6xf32> +// CHECK: %[[CONST_1:.*]] = arith.constant dense +// CHECK-NOT: tensor<2x3x3x2xf32> +// CHECK-SAME: tensor<2x3x1x6xf32> +// CHECK: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1:.*]]) <{config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn_0}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x3xf32>, tensor<2x3x1x6xf32>) -> tensor<*xf32> +// CHECK: %[[BIAS_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_0]], %[[CONST_0:.*]]) <{data_format = "NHWC"}> {device = ""} : (tensor<*xf32>, tensor<6xf32>) -> tensor<*xf32> +// CHECK: return %[[BIAS_0:.*]] : tensor<*xf32> + +// CHECK-LABEL: func private @composite_depthwise_conv2d_fn( +// CHECK-SAME: %arg0: tensor<1x3x4x3xf32>, +// CHECK-SAME: %arg1: tensor<2x3x3x2xf32>) + +// CHECK-LABEL: func private @composite_depthwise_conv2d_fn_0( +// CHECK-SAME: %arg0: tensor<1x3x4x3xf32>, +// CHECK-SAME: %arg1: tensor<2x3x1x6xf32>) +// CHECK: %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]}> {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", device = ""} +// CHECK: return %0 : tensor<*xf32> +} + diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_propagate_quantize_type.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_propagate_quantize_type.mlir new file mode 100644 index 000000000000..7f8bd97c95c6 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_propagate_quantize_type.mlir @@ -0,0 +1,97 @@ +// RUN: tf-quant-opt %s -split-input-file -tf-quant-propagate-quantize-type | FileCheck %s + +module { + func.func @not_propagate_matmul(%arg0: tensor<1x2x2x2xf32>) -> tensor<*xf32> { + %cst = "tf.Const"() {value = dense<127> : tensor<2x1024xi8>} : () -> tensor<2x1024xi8> + %cst_0 = "tf.Const"() {value = dense<0.0157480314> : tensor} : () -> tensor + %0 = "tf.Identity"(%cst) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> + %1 = "tf.Cast"(%0) {Truncate = false} : (tensor<2x1024xi8>) -> tensor<2x1024xf32> + %2 = "tf.MatMul"(%arg0, %1) {attr_map = 
"0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + %3 = "tf.Mul"(%2, %cst_0) : (tensor<*xf32>, tensor) -> tensor<*xf32> + return %3 : tensor<*xf32> + } + +// CHECK-LABEL: func @not_propagate_matmul +// CHECK: %[[CASTED_W:.*]] = "tf.Cast"(%0) <{Truncate = false}> : (tensor<2x1024xi8>) -> tensor<2x1024xf32> +// CHECK: %2 = "tf.MatMul"(%arg0, %[[CASTED_W]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +} + +// ----- + +module { + func.func @propagate_xladotv2_bf16(%arg0: tensor<1x2x2x2xbf16>) -> tensor<1x2x2x1024xbf16> { + %cst = "tf.Const"() {value = dense<127> : tensor<2x1024xi8>} : () -> tensor<2x1024xi8> + %0 = "tf.Identity"(%cst) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> + %1 = "tf.PartitionedCall"(%0) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<2x1024xi8>) -> tensor<2x1024xbf16> + %2 = "tf.XlaDotV2"(%arg0, %1) {device = "", dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""} : (tensor<1x2x2x2xbf16>, tensor<2x1024xbf16>) -> tensor<1x2x2x1024xbf16> + %3 = "tf.Identity"(%2) : (tensor<1x2x2x1024xbf16>) -> tensor<1x2x2x1024xbf16> + return %3 : tensor<1x2x2x1024xbf16> + } + + func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xbf16> { + %cst = "tf.Const"() {value = dense<1.574710e-02> : tensor} : () -> tensor + %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<*xi8>) -> tensor<*xbf16> + %1 = "tf.Mul"(%0, %cst) : (tensor<*xbf16>, tensor) -> tensor<*xbf16> + return %1 : tensor<*xbf16> + } + +// CHECK-LABEL: func @propagate_xladotv2_bf16 +// CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%cst) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> +// CHECK: %[[MATMUL:.*]] = "tf.XlaDotV2"(%arg0, %[[IDENTITY]]) <{dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""}> {device = ""} : (tensor<1x2x2x2xbf16>, tensor<2x1024xi8>) -> tensor<1x2x2x1024xbf16> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[MATMUL]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<1x2x2x1024xbf16>) -> tensor<1x2x2x1024xbf16> +} + +// ----- + +module { + func.func @not_propagate_last_op(%arg0: tensor<10x2xi32>) -> tensor<1x300x10xf32> { + %cst = "tf.Const"() {value = dense<[1, 1, 300]> : tensor<3xi64>} : () -> tensor<3xi64> + %cst_0 = "tf.Const"() {value = dense<127> : tensor<200x100x300xi8>} : () -> tensor<200x100x300xi8> + %0 = "tf.Identity"(%cst_0) : (tensor<200x100x300xi8>) -> tensor<200x100x300xi8> + %1 = "tf.PartitionedCall"(%0) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<200x100x300xi8>) -> tensor<200x100x300xf32> + %2 = "tf.XlaGather"(%1, %arg0, %cst) {dimension_numbers = "\0A\02\00\01\12\01\00\1A\02\00\01 \01", indices_are_sorted = true} : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<1x300x10xf32> + return %2 : tensor<1x300x10xf32> + } + + func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> { + %cst = "tf.Const"() {value = dense<0.0787401571> : tensor} : () -> tensor + %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<*xi8>) -> tensor<*xf32> + %1 = "tf.Mul"(%0, %cst) : (tensor<*xf32>, tensor) -> tensor<*xf32> + return %1 : tensor<*xf32> + } + +} + +// CHECK-LABEL: func @not_propagate_last_op +// CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%cst_0) : 
(tensor<200x100x300xi8>) -> tensor<200x100x300xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[IDENTITY]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<200x100x300xi8>) -> tensor<200x100x300xf32> +// CHECK: %[[GATHER:.*]] = "tf.XlaGather"(%[[DEQUANTIZED]], %arg0, %cst) <{dimension_numbers = "\0A\02\00\01\12\01\00\1A\02\00\01 \01", indices_are_sorted = true}> : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<1x300x10xf32> +// CHECK: return %[[GATHER]] : tensor<1x300x10xf32> + +// ----- + +module { + func.func @propagate_xlagather(%arg0: tensor<10x2xi32>) -> tensor<1x300x10xf32> { + %cst = "tf.Const"() {value = dense<[1, 1, 300]> : tensor<3xi64>} : () -> tensor<3xi64> + %cst_0 = "tf.Const"() {value = dense<127> : tensor<200x100x300xi8>} : () -> tensor<200x100x300xi8> + %0 = "tf.Identity"(%cst_0) : (tensor<200x100x300xi8>) -> tensor<200x100x300xi8> + %1 = "tf.PartitionedCall"(%0) {config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform} : (tensor<200x100x300xi8>) -> tensor<200x100x300xf32> + %2 = "tf.XlaGather"(%1, %arg0, %cst) {dimension_numbers = "\0A\02\00\01\12\01\00\1A\02\00\01 \01", indices_are_sorted = true} : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<1x300x10xf32> + %3 = "tf.Identity"(%2) : (tensor<1x300x10xf32>) -> tensor<1x300x10xf32> + return %3 : tensor<1x300x10xf32> + } + + func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> { + %cst = "tf.Const"() {value = dense<0.0787401571> : tensor} : () -> tensor + %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<*xi8>) -> tensor<*xf32> + %1 = "tf.Mul"(%0, %cst) : (tensor<*xf32>, tensor) -> tensor<*xf32> + return %1 : tensor<*xf32> + } +} + +// CHECK-LABEL: func @propagate_xlagather +// CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%cst_0) : (tensor<200x100x300xi8>) -> tensor<200x100x300xi8> +// CHECK: %[[GATHER:.*]] = "tf.XlaGather"(%[[IDENTITY]], %arg0, %cst) <{dimension_numbers = "\0A\02\00\01\12\01\00\1A\02\00\01 \01", indices_are_sorted = true}> : (tensor<200x100x300xi8>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<1x300x10xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[GATHER]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<1x300x10xi8>) -> tensor<1x300x10xf32> +// CHECK: %[[ORIGINAL_IDENTITY:.*]] = "tf.Identity"(%[[DEQUANTIZED]]) : (tensor<1x300x10xf32>) -> tensor<1x300x10xf32> +// CHECK: return %[[ORIGINAL_IDENTITY]] : tensor<1x300x10xf32> diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_quantize.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_quantize.mlir new file mode 100644 index 000000000000..b0feabba6e0b --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_quantize.mlir @@ -0,0 +1,79 @@ +// RUN: tf-quant-opt %s -split-input-file -quant-lift-quantizable-spots-as-functions -tf-quant-quantize -verify-each=false | FileCheck %s + +func.func private @conv(%input: tensor<1x3x4x3xf32> {tf._user_specified_name = "input_tensor"}) -> tensor<*xf32> attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<1x3x4x3>]} { + %weight = arith.constant dense_resource<__elided__> : tensor<2x3x3x2xf32> + %bias = arith.constant dense<[7.11401462, 7.05456924]> : tensor<2xf32> + + %q_input= "quantization.qcast"(%input) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> + %dq_input= "quantization.dcast"(%q_input) 
: (tensor<1x3x4x3x!quant.uniform>) -> tensor<1x3x4x3xf32> + %q_weight = "quantization.qcast"(%weight) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform> + %dq_weight = "quantization.dcast"(%q_weight) : (tensor<2x3x3x2x!quant.uniform>) -> tensor<2x3x3x2xf32> + %q_bias = "quantization.qcast"(%bias) : (tensor<2xf32>) -> tensor<2x!quant.uniform> + %dq_bias = "quantization.dcast"(%q_bias) : (tensor<2x!quant.uniform>) -> tensor<2xf32> + %conv = "tf.Conv2D"(%dq_input, %dq_weight) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %biasadd = "tf.BiasAdd"(%conv, %dq_bias) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %res = "tf.Relu6"(%biasadd) : (tensor<*xf32>) -> tensor<*xf32> + %q_res = "quantization.qcast"(%res) : (tensor<*xf32>) -> tensor<*x!quant.uniform> + %dq_res = "quantization.dcast"(%q_res) : (tensor<*x!quant.uniform>) -> tensor<*xf32> + + func.return %dq_res : tensor<*xf32> +} + +// CHECK-DAG: [[bias:%.+]] = "arith.constant"() <{value = dense<[7.11401462, 7.05456924]> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: [[weight:%.+]] = "arith.constant"() <{value = dense_resource<__elided__> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2x!quant.uniform> +// CHECK: [[q_input:%.+]] = "quantization.qcast"([[ARG0:%arg[0-9]+]]) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK-NEXT: [[q_bias:%.+]] = "quantization.qcast"([[bias]]) : (tensor<2xf32>) -> tensor<2x!quant.uniform> +// CHECK-NEXT: [[conv:%.+]] = "tf.PartitionedCall"([[q_input]], [[weight]], [[q_bias]]) <{config = "", config_proto = "", executor_type = "", f = @[[composite_fn:composite_conv2d_with_bias_and_relu6_fn.*]]}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<*x!quant.uniform> +// CHECK-NEXT: [[res:%.+]] = "quantization.dcast"([[conv]]) : (tensor<*x!quant.uniform>) -> tensor<*xf32> +// CHECK-NEXT: "func.return"([[res]]) : (tensor<*xf32>) -> () + + +// ----- + +// CHECK-LABEL: same_scale_test +func.func @same_scale_test(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %cst = arith.constant dense<[-1, 144]> : tensor<2xi32> + %0 = "quantization.qcast"(%arg0) : (tensor<*xf32>) -> tensor<*x!quant.uniform> + %1 = "quantization.dcast"(%0) : (tensor<*x!quant.uniform>) -> tensor<*xf32> + %2 = "tf.MaxPool"(%1) {data_format = "NHWC", device = "", explicit_paddings = [], ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 2, 2, 1]} : (tensor<*xf32>) -> tensor<*xf32> + %3 = "quantization.qcast"(%2) {volatile} : (tensor<*xf32>) -> tensor<*x!quant.uniform> + %4 = "quantization.dcast"(%3) : (tensor<*x!quant.uniform>) -> tensor<*xf32> + %5 = "tf.Reshape"(%4, %cst) {device = ""} : (tensor<*xf32>, tensor<2xi32>) -> tensor<*xf32> + %6 = "quantization.qcast"(%5) {volatile} : (tensor<*xf32>) -> tensor<*x!quant.uniform> + %7 = "quantization.dcast"(%6) : (tensor<*x!quant.uniform>) -> tensor<*xf32> + func.return %7 : tensor<*xf32> +} + +// CHECK: %[[q:.*]] = "quantization.qcast"(%arg0) +// CHECK: %[[sc1:.*]] = "quantization.scast"(%[[q]]) : (tensor<*x!quant.uniform>) +// CHECK: %[[maxpool_i8:.*]] = "tf.MaxPool"(%[[sc1]]) +// CHECK-SAME: (tensor<*xi8>) -> tensor<*xi8> +// CHECK: %[[reshape_i8:.*]] = "tf.Reshape"(%[[maxpool_i8]] +// CHECK-SAME: 
(tensor<*xi8>, tensor<2xi32>) -> tensor<*xi8> +// CHECK: %[[sc2:.*]] = "quantization.scast"(%[[reshape_i8]]) +// CHECK: %[[dq:.*]] = "quantization.dcast"(%[[sc2]]) : (tensor<*x!quant.uniform>) +// CHECK: return %[[dq]] + +// ----- + +// CHECK-LABEL: avgpool_test +func.func @avgpool_test(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %cst = arith.constant dense<[-1, 144]> : tensor<2xi32> + %0 = "quantization.qcast"(%arg0) : (tensor<*xf32>) -> tensor<*x!quant.uniform> + %1 = "quantization.dcast"(%0) : (tensor<*x!quant.uniform>) -> tensor<*xf32> + %2 = "tf.AvgPool"(%1) {data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 2, 2, 1]} : (tensor<*xf32>) -> tensor<*xf32> + %3 = "quantization.qcast"(%2) {volatile} : (tensor<*xf32>) -> tensor<*x!quant.uniform> + %4 = "quantization.dcast"(%3) : (tensor<*x!quant.uniform>) -> tensor<*xf32> + func.return %4 : tensor<*xf32> +} + +// CHECK: %[[q:.*]] = "quantization.qcast"(%arg0) +// CHECK: %[[sc1:.*]] = "quantization.scast"(%[[q]]) : (tensor<*x!quant.uniform>) +// CHECK: %[[fcast:.*]] = "tf.Cast"(%[[sc1]]) <{Truncate = false}> : (tensor<*xi8>) -> tensor<*xf32> +// CHECK: %[[avgpool_f32:.*]] = "tf.AvgPool"(%[[fcast]]) +// CHECK-SAME: (tensor<*xf32>) -> tensor<*xf32> +// CHECK: %[[round:.*]] = "tf.Round"(%[[avgpool_f32]]) +// CHECK: %[[icast:.*]] = "tf.Cast"(%[[round]]) <{Truncate = false}> : (tensor<*xf32>) -> tensor<*xi8> +// CHECK: %[[sc2:.*]] = "quantization.scast"(%[[icast]]) +// CHECK: %[[dq:.*]] = "quantization.dcast"(%[[sc2]]) : (tensor<*x!quant.uniform>) +// CHECK: return %[[dq]] diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_quantize_composite_functions.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_quantize_composite_functions.mlir new file mode 100644 index 000000000000..c677bc9715c9 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_quantize_composite_functions.mlir @@ -0,0 +1,202 @@ +// RUN: tf-quant-opt %s -split-input-file -tf-quant-insert-quantized-functions -tf-quant-quantize-composite-functions | FileCheck %s + +module { + func.func @conv(%arg0: tensor<1x2x2x3xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<[[[[1.600000e-01, 1.000000e-01], [5.100000e-01, 5.400000e-01], [-5.000000e-01, 4.100000e-01]], [[-3.500000e-01, 5.000000e-02], [-0.00999999977, 1.600000e-01], [-4.800000e-01, -2.400000e-01]]], [[[-3.500000e-01, -2.100000e-01], [-1.400000e-01, -2.000000e-02], [4.800000e-01, 3.500000e-01]], [[-1.900000e-01, 3.200000e-01], [0.00999999977, -7.000000e-02], [2.000000e-01, -4.000000e-02]]]]> : tensor<2x2x3x2xf32>} : () -> tensor<2x2x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<[-2.000000e+00, 3.000000e+00]> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "quantization.qcast"(%cst) : (tensor<2x2x3x2xf32>) -> tensor<2x2x3x2x!quant.uniform:f32:3, {4.000000e-03,5.000000e-03}>> + %1 = "quantization.dcast"(%0) : (tensor<2x2x3x2x!quant.uniform:f32:3, {4.000000e-03,5.000000e-03}>>) -> tensor<*xf32> + %2 = "quantization.qcast"(%arg0) : (tensor<1x2x2x3xf32>) -> tensor<1x2x2x3x!quant.uniform> + %3 = "quantization.dcast"(%2) : (tensor<1x2x2x3x!quant.uniform>) -> tensor<*xf32> + %4 = "tf.PartitionedCall"(%3, %1, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2} : (tensor<*xf32>, tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %5 = "quantization.qcast"(%4) : (tensor<*xf32>) -> tensor<*x!quant.uniform> + %6 = "quantization.dcast"(%5) : 
(tensor<*x!quant.uniform>) -> tensor<*xf32> + %7 = "tf.PartitionedCall"(%arg0, %cst, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} : (tensor<1x2x2x3xf32>, tensor<2x2x3x2xf32>, tensor<2xf32>) -> tensor<*xf32> + func.return %6, %7 : tensor<*xf32>, tensor<*xf32> + } + func.func private @composite_conv2d_with_bias_and_relu6_fn_2(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<2xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %arg2) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) : (tensor<*xf32>) -> tensor<*xf32> + func.return %2 : tensor<*xf32> + } + func.func private @composite_conv2d_with_bias_and_relu6_fn_1(%arg0: tensor<1x2x2x3xf32>, %arg1: tensor<2x2x3x2xf32>, %arg2: tensor<2xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x2x2x3xf32>, tensor<2x2x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %arg2) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) : (tensor<*xf32>) -> tensor<*xf32> + func.return %2 : tensor<*xf32> + } + +// CHECK-LABEL: func @conv +// CHECK-DAG: %[[w_float:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}1.600000e-01 +// CHECK-DAG: %[[b_float:.*]] = "tf.Const"() <{value = dense<[-2.000000e+00, 3.000000e+00]> : tensor<2xf32> +// CHECK-DAG: %[[in_scale:.*]] = "tf.Const"() <{value = dense<8.000000e-03> : tensor}> : () -> tensor +// CHECK-DAG: %[[in_zp:.*]] = "tf.Const"() <{value = dense<0> : tensor}> +// CHECK-DAG: %[[w_scale:.*]] = "tf.Const"() <{value = dense<[4.000000e-03 +// CHECK-DAG: %[[w_zp:.*]] = "tf.Const"() <{value = dense<0> : tensor<2xi32>}> +// CHECK-DAG: %[[b_scale:.*]] = "tf.Const"() <{value = dense<[3.200000e-05, 4.000000e-05]> : tensor<2xf32>} +// CHECK-DAG: %[[out_scale:.*]] = "tf.Const"() <{value = dense<5.000000e-02> : tensor}> +// CHECK-DAG: %[[out_zp:.*]] = "tf.Const"() <{value = dense<-1> : tensor}> +// CHECK-DAG: %[[b_quant:.*]] = "tf.Const"() <{value = dense<[-62500, 75000]> : tensor<2xi32>}> +// CHECK-DAG: %[[w_quant:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}40, 20] +// CHECK-DAG: {{\[\[\[}}-87, -42] + +// CHECK: %[[quantize:.*]] = "tf.PartitionedCall"(%arg0, %[[in_scale]], %[[in_zp]]) +// CHECK-SAME: f = @quantize_i8 +// CHECK: %[[conv_quant:.*]] = "tf.PartitionedCall"(%[[quantize]], %[[w_quant]], %[[b_quant]], +// CHECK-SAME: %[[in_scale]], %[[in_zp]], %[[w_scale]], %[[w_zp]], +// CHECK-SAME: %[[b_scale]], %[[w_zp]], %[[out_scale]], %[[out_zp]]) +// CHECK-SAME: f = @quantized_conv2d_with_bias_and_relu6_fn_0 +// CHECK-SAME: (tensor<1x2x2x3xi8>, tensor<2x2x3x2xi8>, tensor<2xi32>, tensor, tensor, tensor<2xf32>, tensor<2xi32>, tensor<2xf32>, tensor<2xi32>, tensor, tensor) -> tensor<*xi8> +// CHECK: %[[dequantize:.*]] = "tf.PartitionedCall"(%[[conv_quant]], 
%[[out_scale]], %[[out_zp]]) +// CHECK-SAME: f = @dequantize_i8 + +// CHECK: %[[conv_float:.*]] = "tf.PartitionedCall"(%arg0, %[[w_float]], %[[b_float]]) +// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_1 + +// CHECK: return %[[dequantize]], %[[conv_float]] + +// CHECK-LABEL: func private @composite_conv2d_with_bias_and_relu6_fn_1 +// CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D" +// CHECK-SAME: data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true +// CHECK-SAME: device = "" +// CHECK: %[[BIASADD_0:.*]] = "tf.BiasAdd" +// CHECK: %[[RELU6_0:.*]] = "tf.Relu6" + +// CHECK-LABEL: func private @quantized_conv2d_with_bias_and_relu6_fn_0 +// CHECK-SAME: (%arg0: tensor<1x2x2x3xi8>, %arg1: tensor<2x2x3x2xi8>, %arg2: tensor<2xi32>, %arg3: tensor, %arg4: tensor, %arg5: tensor<2xf32>, %arg6: tensor<2xi32>, %arg7: tensor<2xf32>, %arg8: tensor<2xi32>, %arg9: tensor, %arg10: tensor) -> tensor<*xi8> +// CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D" +// CHECK-SAME: {dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} + +// CHECK: -------- Quantization Summary -------- +// CHECK: Number of quantized layers in the model +// CHECK: -------------------------------- +// CHECK: Name Count/Total +// CHECK: ================================ +// CHECK: Conv2D 1/2 + +// CHECK: Number of quantized layers with quantized outputs: 1/1 +// CHECK: Number of quantize layers added: 1 +// CHECK: Number of dequantize layers added: 1 +} + +// ----- + +module { + func.func @conv_with_default_attributes(%arg0: tensor<1x2x2x3xf32>) -> (tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<[[[[1.600000e-01, 1.000000e-01], [5.100000e-01, 5.400000e-01], [-5.000000e-01, 4.100000e-01]], [[-3.500000e-01, 5.000000e-02], [-0.00999999977, 1.600000e-01], [-4.800000e-01, -2.400000e-01]]], [[[-3.500000e-01, -2.100000e-01], [-1.400000e-01, -2.000000e-02], [4.800000e-01, 3.500000e-01]], [[-1.900000e-01, 3.200000e-01], [0.00999999977, -7.000000e-02], [2.000000e-01, -4.000000e-02]]]]> : tensor<2x2x3x2xf32>} : () -> tensor<2x2x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<[-2.000000e+00, 3.000000e+00]> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "quantization.qcast"(%cst) : (tensor<2x2x3x2xf32>) -> tensor<2x2x3x2x!quant.uniform:f32:3, {4.000000e-03,5.000000e-03}>> + %1 = "quantization.dcast"(%0) : (tensor<2x2x3x2x!quant.uniform:f32:3, {4.000000e-03,5.000000e-03}>>) -> tensor<*xf32> + %2 = "quantization.qcast"(%arg0) : (tensor<1x2x2x3xf32>) -> tensor<1x2x2x3x!quant.uniform> + %3 = "quantization.dcast"(%2) : (tensor<1x2x2x3x!quant.uniform>) -> tensor<*xf32> + %4 = "tf.PartitionedCall"(%3, %1, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} : (tensor<*xf32>, tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %5 = "quantization.qcast"(%4) : (tensor<*xf32>) -> tensor<*x!quant.uniform> + %6 = "quantization.dcast"(%5) : (tensor<*x!quant.uniform>) -> tensor<*xf32> + func.return %6 : tensor<*xf32> + } + func.func private @composite_conv2d_with_bias_and_relu6_fn_1(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<2xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", 
strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %arg2) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) : (tensor<*xf32>) -> tensor<*xf32> + func.return %2 : tensor<*xf32> + } + +// CHECK-LABEL: func @conv_with_default_attributes + +// CHECK: %[[quantize:.*]] = "tf.PartitionedCall"(%arg0 +// CHECK-SAME: f = @quantize_i8 +// CHECK: %[[conv_quant:.*]] = "tf.PartitionedCall"(%[[quantize]] +// CHECK-SAME: f = @quantized_conv2d_with_bias_and_relu6_fn_0 +// CHECK-SAME: (tensor<1x2x2x3xi8>, tensor<2x2x3x2xi8>, tensor<2xi32>, tensor, tensor, tensor<2xf32>, tensor<2xi32>, tensor<2xf32>, tensor<2xi32>, tensor, tensor) -> tensor<*xi8> +// CHECK: %[[dequantize:.*]] = "tf.PartitionedCall"(%[[conv_quant]] +// CHECK-SAME: f = @dequantize_i8 +// CHECK: return %[[dequantize]] + +// CHECK-LABEL: func private @quantized_conv2d_with_bias_and_relu6_fn_0 +// CHECK-SAME: (%arg0: tensor<1x2x2x3xi8>, %arg1: tensor<2x2x3x2xi8>, %arg2: tensor<2xi32>, %arg3: tensor, %arg4: tensor, %arg5: tensor<2xf32>, %arg6: tensor<2xi32>, %arg7: tensor<2xf32>, %arg8: tensor<2xi32>, %arg9: tensor, %arg10: tensor) -> tensor<*xi8> +// CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D" +// CHECK-SAME: {dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} + +// CHECK: -------- Quantization Summary -------- +// CHECK: Number of quantized layers in the model +// CHECK: -------------------------------- +// CHECK: Name Count/Total +// CHECK: ================================ +// CHECK: Conv2D 1/1 + +// CHECK: Number of quantized layers with quantized outputs: 1/1 +// CHECK: Number of quantize layers added: 1 +// CHECK: Number of dequantize layers added: 1 +} + +// ----- + +module { + func.func @conv_with_avgpool(%arg0: tensor<1x2x2x3xf32>) -> (tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<[[[[1.600000e-01, 1.000000e-01], [5.100000e-01, 5.400000e-01], [-5.000000e-01, 4.100000e-01]], [[-3.500000e-01, 5.000000e-02], [-0.00999999977, 1.600000e-01], [-4.800000e-01, -2.400000e-01]]], [[[-3.500000e-01, -2.100000e-01], [-1.400000e-01, -2.000000e-02], [4.800000e-01, 3.500000e-01]], [[-1.900000e-01, 3.200000e-01], [0.00999999977, -7.000000e-02], [2.000000e-01, -4.000000e-02]]]]> : tensor<2x2x3x2xf32>} : () -> tensor<2x2x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<[-2.000000e+00, 3.000000e+00]> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "quantization.qcast"(%cst) : (tensor<2x2x3x2xf32>) -> tensor<2x2x3x2x!quant.uniform:f32:3, {4.000000e-03,5.000000e-03}>> + %1 = "quantization.dcast"(%0) : (tensor<2x2x3x2x!quant.uniform:f32:3, {4.000000e-03,5.000000e-03}>>) -> tensor<*xf32> + %2 = "quantization.qcast"(%arg0) : (tensor<1x2x2x3xf32>) -> tensor<1x2x2x3x!quant.uniform> + %3 = "quantization.dcast"(%2) : (tensor<1x2x2x3x!quant.uniform>) -> tensor<*xf32> + %4 = "tf.PartitionedCall"(%3, %1, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1} : (tensor<*xf32>, tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %5 = "quantization.qcast"(%4) : (tensor<*xf32>) -> tensor<*x!quant.uniform> + %6 = "quantization.dcast"(%5) : (tensor<*x!quant.uniform>) -> tensor<*xf32> + %7 = "tf.AvgPool"(%6) {data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<*xf32>) -> tensor<*xf32> + func.return %7 : tensor<*xf32> + } + func.func private 
@composite_conv2d_with_bias_and_relu6_fn_1(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<2xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %arg2) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) : (tensor<*xf32>) -> tensor<*xf32> + func.return %2 : tensor<*xf32> + } + +// CHECK-LABEL: func @conv_with_avgpool +// CHECK: %[[quantize:.*]] = "tf.PartitionedCall"(%arg0 +// CHECK-SAME: f = @quantize_i8 +// CHECK: %[[conv_quant:.*]] = "tf.PartitionedCall"(%[[quantize]] +// CHECK-SAME: f = @quantized_conv2d_with_bias_and_relu6_fn_0 +// CHECK-SAME: (tensor<1x2x2x3xi8>, tensor<2x2x3x2xi8>, tensor<2xi32>, tensor, tensor, tensor<2xf32>, tensor<2xi32>, tensor<2xf32>, tensor<2xi32>, tensor, tensor) -> tensor<*xi8> +// CHECK: %[[cast_1:.*]] = "tf.Cast"(%[[conv_quant]]) <{Truncate = false}> : (tensor<*xi8>) -> tensor<*xf32> +// CHECK: %[[avgpool:.*]] = "tf.AvgPool"(%[[cast_1]]) <{data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]}> : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: %[[round:.*]] = "tf.Round"(%[[avgpool]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: %[[cast_2:.*]] = "tf.Cast"(%[[round]]) <{Truncate = false}> : (tensor<*xf32>) -> tensor<*xi8> +// CHECK: %[[dequantize:.*]] = "tf.PartitionedCall"(%[[cast_2]] +// CHECK-SAME: f = @dequantize_i8 +// CHECK: return %[[dequantize]] + +// CHECK: -------- Quantization Summary -------- +// CHECK: Number of quantized layers in the model +// CHECK: -------------------------------- +// CHECK: Name Count/Total +// CHECK: ================================ +// CHECK: Conv2D 1/1 + +// CHECK: Number of quantized layers with quantized outputs: 1/1 +// CHECK: Number of quantize layers added: 1 +// CHECK: Number of dequantize layers added: 1 +} + + +// ----- + +module { + func.func @float_einsum(%arg0: tensor, %arg1: tensor<32x2x16xf32>) -> (tensor) { + %0 = "tf.Einsum"(%arg0, %arg1) {equation = "abc,cde->abde"} : (tensor, tensor<32x2x16xf32>) -> tensor + func.return %0 : tensor + } + +// CHECK-LABEL: func @float_einsum +// CHECK: -------- Quantization Summary -------- +// CHECK: Number of quantized layers in the model +// CHECK: -------------------------------- +// CHECK: Name Count/Total +// CHECK: ================================ +// CHECK: Einsum 0/1 + +// CHECK: Number of quantized layers with quantized outputs: 0/0 +// CHECK: Number of quantize layers added: 0 +// CHECK: Number of dequantize layers added: 0 +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_quantize_weights.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_quantize_weights.mlir new file mode 100644 index 000000000000..7f7a5090439e --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_quantize_weights.mlir @@ -0,0 +1,525 @@ +// RUN: tf-quant-opt %s -split-input-file -quant-quantize-weights | FileCheck %s + +module { + func.func @not_quantize_const() -> (tensor<2x1024xf32>) { + // Nothing happens if not connected with quantizable op.
+ %cst_0 = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x1024xf32>} : () -> tensor<2x1024xf32> + func.return %cst_0: tensor<2x1024xf32> + } + +// CHECK-LABEL: func @not_quantize_const +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<2x1024xf32> +// CHECK: return %[[W]] : tensor<2x1024xf32> +} + +// ----- + +module { + func.func @matmul(%arg0: tensor<1x2x2x2xf32>) -> (tensor<*xf32>) { + %cst_0 = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x1024xf32>} : () -> tensor<2x1024xf32> + %0 = "tf.MatMul"(%arg0, %cst_0) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + func.return %0: tensor<*xf32> + } + +// CHECK-LABEL: func @matmul +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x1024xi8> +// CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x1024xi8>) -> tensor<2x1024xf32> +// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[DEQUANTIZED]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: return %[[MATMUL]] : tensor<*xf32> + +// CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0157480314> : tensor +// CHECK: %[[CASTED_W:.*]] = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<*xi8>) -> tensor<*xf32> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.Mul"(%[[CASTED_W]], %[[SCALE]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> +// CHECK: return %[[DEQUANTIZED]] : tensor<*xf32> +} + +// ----- + +module { + func.func @not_quantize_matmul_without_const(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x1024xf32>) -> (tensor<*xf32>) { + %arg0_identity = "tf.Identity"(%arg0) {device = ""} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> + %arg1_identity = "tf.Identity"(%arg1) {device = ""} : (tensor<2x1024xf32>) -> tensor<2x1024xf32> + %0 = "tf.MatMul"(%arg0_identity, %arg1_identity) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + func.return %0: tensor<*xf32> + } + +// CHECK-LABEL: func @not_quantize_matmul_without_const +// CHECK: %[[ORIGINAL_IDENTITY_1:.*]] = "tf.Identity"(%arg0) {device = ""} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> +// CHECK: %[[ORIGINAL_IDENTITY_2:.*]] = "tf.Identity"(%arg1) {device = ""} : (tensor<2x1024xf32>) -> tensor<2x1024xf32> +// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%[[ORIGINAL_IDENTITY_1]], %[[ORIGINAL_IDENTITY_2]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: return %[[MATMUL]] : tensor<*xf32> +} + +// ----- + +module { + func.func @quantize_xladotv2_bf16(%arg0: tensor<1x2x2x2xbf16>) -> (tensor<1x2x2x1024xbf16>) { + %cst_0 = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x1024xbf16>} : () -> tensor<2x1024xbf16> + %0 = "tf.XlaDotV2"(%arg0, %cst_0) {device = "", dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""} : (tensor<1x2x2x2xbf16>, tensor<2x1024xbf16>) -> tensor<1x2x2x1024xbf16> + // Check dequantize performed in bf16. 
+ func.return %0: tensor<1x2x2x1024xbf16> + } + +// CHECK-LABEL: func @quantize_xladotv2_bf16 +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x1024xi8> +// CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[IDENTITY]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x1024xi8>) -> tensor<2x1024xbf16> +// CHECK: %[[MATMUL:.*]] = "tf.XlaDotV2"(%arg0, %[[DEQUANTIZED]]) <{dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""}> {device = ""} : (tensor<1x2x2x2xbf16>, tensor<2x1024xbf16>) -> tensor<1x2x2x1024xbf16> +// CHECK: return %[[MATMUL]] : tensor<1x2x2x1024xbf16> + +// CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xbf16> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<1.574710e-02> : tensor +} + +// ----- + +module { + func.func @matmul_with_identity_and_reshape(%arg0: tensor<1x2x2x2xf32>) -> (tensor<*xf32>) { + %cst_0 = "tf.Const"() {value = dense<2.000000e+00> : tensor<1024x2xf32>} : () -> tensor<1024x2xf32> + %cst_1 = "tf.Const"() {value = dense<[2, 1024]> : tensor<2xi32>} : () -> tensor<2xi32> + // Original identity preserved. + %cst_identity = "tf.Identity"(%cst_0) {device = ""} : (tensor<1024x2xf32>) -> tensor<1024x2xf32> + %0 = "tf.Reshape"(%cst_identity, %cst_1) : (tensor<1024x2xf32>, tensor<2xi32>) -> tensor<2x1024xf32> + %1 = "tf.MatMul"(%arg0, %0) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + func.return %1: tensor<*xf32> + } + +// CHECK-LABEL: func @matmul_with_identity_and_reshape +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<1024x2xi8> +// CHECK-DAG: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 1024]> : tensor<2xi32> +// CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<1024x2xi8>) -> tensor<1024x2xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<1024x2xi8>) -> tensor<1024x2xf32> +// CHECK: %[[ORIGINAL_IDENTITY:.*]] = "tf.Identity"(%[[DEQUANTIZED]]) {device = ""} : (tensor<1024x2xf32>) -> tensor<1024x2xf32> +// CHECK: %[[RESHAPED_W:.*]] = "tf.Reshape"(%[[ORIGINAL_IDENTITY]], %[[SHAPE]]) : (tensor<1024x2xf32>, tensor<2xi32>) -> tensor<2x1024xf32> +// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[RESHAPED_W]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: return %[[MATMUL]] : tensor<*xf32> + +// CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0157480314> : tensor +} + +// ----- + +module { + func.func @conv2d(%arg0: tensor<1x3x4x3xf32>) -> (tensor<*xf32>) { + %cst_0 = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_1 = "tf.Const"() {value = dense<3.000000e+00> : tensor<2x3x3x512xf32>} : () -> tensor<2x3x3x512xf32> + %0 = "tf.Conv2D"(%arg0, %cst_1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : 
(tensor<1x3x4x3xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> + // Dequantize added before BiasAdd. + %2 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + func.return %2: tensor<*xf32> + } + +// CHECK-LABEL: func @conv2d +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x3x3x512xi8> +// CHECK-DAG: %[[BIAS:.*]] = "tf.Const"() <{value = dense<0.000000e+00> : tensor<2xf32> +// CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x3x3x512xi8>) -> tensor<2x3x3x512xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x3x3x512xi8>) -> tensor<2x3x3x512xf32> +// CHECK: %[[CONV2D:.*]] = "tf.Conv2D"(%arg0, %[[DEQUANTIZED:.*]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> +// CHECK: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[CONV2D]], %[[BIAS]]) <{data_format = "NHWC"}> {device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> +// CHECK: return %[[BIASADD]] : tensor<*xf32> + +// CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0236220472> : tensor +} + +// ----- + +module { + func.func @depthwise_conv(%arg0: tensor<1x3x4x512xf32>) -> (tensor<*xf32>) { + %cst_0 = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_1 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x512xf32>} : () -> tensor<2x3x3x512xf32> + %0 = "tf.DepthwiseConv2dNative"(%arg0, %cst_1) { + attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1] + } : (tensor<1x3x4x512xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> + func.return %0: tensor<*xf32> + } + +// CHECK-LABEL: func @depthwise_conv +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x3x3x512xi8> +// CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x3x3x512xi8>) -> tensor<2x3x3x512xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x3x3x512xi8>) -> tensor<2x3x3x512xf32> +// CHECK: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[DEQUANTIZED]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1]}> {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", device = ""} : (tensor<1x3x4x512xf32>, tensor<2x3x3x512xf32>) -> tensor<*xf32> +// CHECK: return %[[DEPTHWISE_CONV2D]] : tensor<*xf32> + +// CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.00787401571> : tensor +} + +// ----- + +module { + func.func @quantize_sharded_weights_with_xladot(%arg0: tensor) -> tensor { + %cst = "tf.Const"() {device = "", value = dense<1.000000e+01> : tensor<512x512xf32>} : () -> tensor<512x512xf32> + %cst_sharded = "tf.XlaSharding"(%cst) {_XlaSharding = 
"\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01", device = "", sharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01", unspecified_dims = []} : (tensor<512x512xf32>) -> tensor<512x512xf32> + %1 = "tf.XlaDotV2"(%arg0, %cst_sharded) {device = "", dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""} : (tensor, tensor<512x512xf32>) -> tensor + %2 = "tf.Cast"(%1) {Truncate = false} : (tensor) -> tensor + return %2 : tensor + } + +// CHECK-LABEL: func @quantize_sharded_weights_with_xladot +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<512x512xi8> +// CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<512x512xi8>) -> tensor<512x512xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<512x512xi8>) -> tensor<512x512xf32> +// CHECK: %[[SHARDED_W:.*]] = "tf.XlaSharding"(%[[DEQUANTIZED]]) <{_XlaSharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01", sharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01"}> {device = "", unspecified_dims = []} : (tensor<512x512xf32>) -> tensor<512x512xf32> +// CHECK: %[[XLADOT:.*]] = "tf.XlaDotV2"(%arg0, %[[SHARDED_W]]) <{dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""}> {device = ""} : (tensor, tensor<512x512xf32>) -> tensor +// CHECK: %[[ORIGINAL_CAST:.*]] = "tf.Cast"(%[[XLADOT]]) <{Truncate = false}> : (tensor) -> tensor +// CHECK: return %[[ORIGINAL_CAST]] : tensor + +// CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0787401571> : tensor +} + +// ----- + +module { + func.func @quantize_sharded_weights_with_xladot_with_identity(%arg0: tensor) -> tensor { + %cst = "tf.Const"() {device = "", value = dense<1.000000e+01> : tensor<512x512xf32>} : () -> tensor<512x512xf32> + %cst_identity = "tf.Identity"(%cst) {device = ""} : (tensor<512x512xf32>) -> tensor<512x512xf32> + %cst_sharded = "tf.XlaSharding"(%cst_identity) {_XlaSharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01", device = "", sharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01", unspecified_dims = []} : (tensor<512x512xf32>) -> tensor<512x512xf32> + %1 = "tf.XlaDotV2"(%arg0, %cst_sharded) {device = "", dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""} : (tensor, tensor<512x512xf32>) -> tensor + return %1 : tensor + } + +// CHECK-LABEL: func @quantize_sharded_weights_with_xladot_with_identity +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<512x512xi8> +// CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<512x512xi8>) -> tensor<512x512xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<512x512xi8>) -> tensor<512x512xf32> +// CHECK: %[[IDENTITY_W:.*]] = "tf.Identity"(%[[DEQUANTIZED]]) {device = ""} : (tensor<512x512xf32>) -> tensor<512x512xf32> +// CHECK: %[[SHARDED_W:.*]] = "tf.XlaSharding"(%[[IDENTITY_W]]) <{_XlaSharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01", sharding = "\08\03\1A\03\01\04\02\22\08\00\04\01\05\02\06\03\070\01"}> {device = "", unspecified_dims = []} : (tensor<512x512xf32>) -> tensor<512x512xf32> +// CHECK: %[[XLADOT:.*]] = "tf.XlaDotV2"(%arg0, %[[SHARDED_W]]) <{dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""}> {device 
= ""} : (tensor, tensor<512x512xf32>) -> tensor +// CHECK: return %[[XLADOT]] : tensor + +// CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0787401571> : tensor +} + +// ----- + +module { + func.func @quantize_xlagather(%arg0: tensor<10x2xi32>) -> tensor<1x300x10xf32> { + %cst_0 = "tf.Const"() {device = "", value = dense<1.000000e+01> : tensor<200x100x300xf32>} : () -> tensor<200x100x300xf32> + %cst = "tf.Const"() { value = dense<[1, 1, 300]> : tensor<3xi64> } : () -> tensor<3xi64> + %0 = "tf.XlaGather"(%cst_0, %arg0, %cst) {dimension_numbers = "\0A\02\00\01\12\01\00\1A\02\00\01\20\01", indices_are_sorted = true} : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<1x300x10xf32> + %1 = "tf.Identity"(%0) {device = ""} : (tensor<1x300x10xf32>) -> tensor<1x300x10xf32> + func.return %1 : tensor<1x300x10xf32> + } + +// CHECK-LABEL: func @quantize_xlagather +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<200x100x300xi8>}> : () -> tensor<200x100x300xi8> +// CHECK-DAG: %[[IDX:.*]] = "tf.Const"() <{value = dense<[1, 1, 300]> : tensor<3xi64> +// CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<200x100x300xi8>) -> tensor<200x100x300xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<200x100x300xi8>) -> tensor<200x100x300xf32> +// CHECK: %[[GATHER:.*]] = "tf.XlaGather"(%[[DEQUANTIZED]], %arg0, %[[IDX]]) <{dimension_numbers = "\0A\02\00\01\12\01\00\1A\02\00\01 \01", indices_are_sorted = true}> : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<1x300x10xf32> +// CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[GATHER]]) {device = ""} : (tensor<1x300x10xf32>) -> tensor<1x300x10xf32> +// CHECK: return %[[IDENTITY]] : tensor<1x300x10xf32> + +// CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0787401571> : tensor}> : () -> tensor +} + +// ----- + +module { + func.func @partitioned_call(%arg0: tensor<1x2x2x3xf32>) -> (tensor<*xf32>) { + %cst_0 = "tf.Const"() {value = dense<4.000000e+00> : tensor<2x1024xf32>} : () -> tensor<2x1024xf32> + %1 = "tf.PartitionedCall"(%arg0, %cst_0) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + func.return %1: tensor<*xf32> + } + + func.func private @composite_matmul_fn(%arg0: tensor<1x2x2x3xf32>, %arg1: tensor<2x1024xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + // Dequantization performed here + return %0 : tensor<*xf32> + } + +// CHECK-LABEL: func @partitioned_call +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x1024xi8> +// CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x1024xi8>) -> tensor<2x1024xf32> +// CHECK: %[[OUTPUT:.*]] = "tf.PartitionedCall"(%arg0, %[[DEQUANTIZED]]) <{config = 
"", config_proto = "", executor_type = "", f = @composite_matmul_fn}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: return %[[OUTPUT]] : tensor<*xf32> + +// CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0314960629> : tensor + +// CHECK-LABEL: func private @composite_matmul_fn +// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: return %[[MATMUL]] : tensor<*xf32> +} + +// ----- + +module { + func.func @recursive_partitioned_call(%arg0: tensor<1x2x2x3xf32>) -> (tensor<*xf32>) { + %cst_0 = "tf.Const"() {value = dense<4.000000e+00> : tensor<2x1024xf32>} : () -> tensor<2x1024xf32> + %1 = "tf.PartitionedCall"(%arg0, %cst_0) {config = "", config_proto = "", executor_type = "", f = @outer_fn} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + func.return %1: tensor<*xf32> + } + + func.func private @outer_fn(%arg0: tensor<1x2x2x3xf32>, %arg1: tensor<2x1024xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.PartitionedCall"(%arg0, %arg1) {config = "", config_proto = "", executor_type = "", f = @inner_fn} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> + } + + func.func private @inner_fn(%arg0: tensor<1x2x2x3xf32>, %arg1: tensor<2x1024xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + // Dequantization performed here + return %0 : tensor<*xf32> + } +} + +// CHECK-LABEL: func @recursive_partitioned_call(%arg0: tensor<1x2x2x3xf32>) -> tensor<*xf32> +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x1024xi8> +// CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x1024xi8>) -> tensor<2x1024xf32> +// CHECK: %[[OUTPUT:.*]] = "tf.PartitionedCall"(%arg0, %[[DEQUANTIZED]]) <{config = "", config_proto = "", executor_type = "", f = @outer_fn}> : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: return %[[OUTPUT]] : tensor<*xf32> + +// CHECK-LABEL: func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0314960629> : tensor + +// CHECK-LABEL: func private @outer_fn +// CHECK: %[[OUTER_OUTPUT:.*]] = "tf.PartitionedCall"(%arg0, %arg1) <{config = "", config_proto = "", executor_type = "", f = @inner_fn}> : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: return %[[OUTER_OUTPUT]] : tensor<*xf32> + +// CHECK-LABEL: func private @inner_fn +// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x3xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: return %[[MATMUL]] : tensor<*xf32> + +// ----- + +module { + func.func @matmul_multiuses(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<1x2x2x2xf32>) -> (tensor<*xf32>, 
tensor<*xf32>, tensor<*xf32>) { + %cst_0 = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x1024xf32>} : () -> tensor<2x1024xf32> + %0 = "tf.MatMul"(%arg0, %cst_0) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + %1 = "tf.MatMul"(%arg1, %cst_0) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + %cst_identity = "tf.Identity"(%cst_0) {device = ""} : (tensor<2x1024xf32>) -> tensor<2x1024xf32> + %2 = "tf.MatMul"(%arg0, %cst_identity) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + func.return %0, %1, %2 : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> + } + +// CHECK-LABEL: func @matmul_multiuses +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x1024xi8> +// CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x1024xi8>) -> tensor<2x1024xf32> +// CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[DEQUANTIZED]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg1, %[[DEQUANTIZED]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[ORIGINAL_IDENTITY:.*]] = "tf.Identity"(%[[DEQUANTIZED]]) {device = ""} : (tensor<2x1024xf32>) -> tensor<2x1024xf32> +// CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[ORIGINAL_IDENTITY]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: return %[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]] : tensor<*xf32>, tensor<*xf32>, tensor<*xf32> + +// CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0157480314> : tensor +} + +// ----- + +module { + func.func @matmul_multiuses_with_unquantizable_op(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x1024xf32>) -> (tensor<*xf32>, tensor<2x1024xf32>) { + %cst_0 = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x1024xf32>} : () -> tensor<2x1024xf32> + %0 = "tf.MatMul"(%arg0, %cst_0) {attr_map = "0:transpose_a,1:transpose_a", device = "", transpose_a = false, transpose_b = false} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> + // AddV2 not in quantizable op list. 
+ %1 = "tf.AddV2"(%arg1, %cst_0) {device = ""} : (tensor<2x1024xf32>, tensor<2x1024xf32>) -> tensor<2x1024xf32> + func.return %0, %1 : tensor<*xf32>, tensor<2x1024xf32> + } + +// CHECK-LABEL: func @matmul_multiuses +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x1024xi8> +// CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<2x1024xi8>) -> tensor<2x1024xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<2x1024xi8>) -> tensor<2x1024xf32> +// CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[DEQUANTIZED]]) <{transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_a", device = ""} : (tensor<1x2x2x2xf32>, tensor<2x1024xf32>) -> tensor<*xf32> +// CHECK: %[[ADD:.*]] = "tf.AddV2"(%arg1, %[[DEQUANTIZED]]) {device = ""} : (tensor<2x1024xf32>, tensor<2x1024xf32>) -> tensor<2x1024xf32> +// CHECK: return %[[MATMUL]], %[[ADD]] : tensor<*xf32>, tensor<2x1024xf32> + +// CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.0157480314> : tensor +} + +// ----- + +module { + func.func @matmul_with_while(%arg0: tensor<1x1024xf32>) -> tensor<1x1024xf32> { + %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %cst_1 = "tf.Const"(){value = dense<1.0> : tensor<1024x1024xf32>} : () -> tensor<1024x1024xf32> + %0:5 = "tf.While"(%cst_0, %cst, %cst_0, %arg0, %cst_1) {T = [i32, i32, i32, f32, f32],_lower_using_switch_merge = true, _num_original_outputs = 5 : i64, _read_only_resource_inputs = [], body = @while_body, cond = @while_cond, device = "", is_stateless = true, output_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<1x1024>, #tf_type.shape<1024x1024>], parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor, tensor<1x1024xf32>, tensor<1024x1024xf32>) -> (tensor, tensor, tensor, tensor<1x1024xf32>, tensor<1024x1024xf32>) + %1 = "tf.Identity"(%0#3) {device = ""} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + func.return %1 : tensor<1x1024xf32> + } + + func.func private @while_body(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xf32>, %arg4: tensor<1024x1024xf32>) -> (tensor, tensor, tensor, tensor<1x1024xf32>, tensor<1024x1024xf32>) + { + %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.AddV2"(%arg2, %cst) {device = ""} : (tensor, tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + %2 = "tf.MatMul"(%arg3, %arg4) {device = "", transpose_a = false, transpose_b = false} : (tensor<1x1024xf32>, tensor<1024x1024xf32>) -> tensor<1x1024xf32> + %3 = "tf.Identity"(%2) {device = ""} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + %4 = "tf.AddV2"(%arg0, %cst) {device = ""} : (tensor, tensor) -> tensor + %5 = "tf.Identity"(%arg4) {device = ""} : (tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + %6 = "tf.MatMul"(%arg3, %5) {device = "", transpose_a = false, transpose_b = false} : (tensor<1x1024xf32>, tensor<1024x1024xf32>) -> tensor<1x1024xf32> + %7 = "tf.AddV2"(%2, %6) {device = ""} : (tensor<1x1024xf32>, tensor<1x1024xf32>) -> tensor<1x1024xf32> + %8 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + %9 = "tf.Identity"(%arg1) {device = ""} : (tensor) -> tensor + func.return %8, %9, %1, %7, %arg4 : tensor, tensor, tensor, tensor<1x1024xf32>, 
tensor<1024x1024xf32> + } + + func.func private @while_cond(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xf32>, %arg4: tensor<1024x1024xf32>) -> tensor + { + %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.Less"(%arg0, %cst) : (tensor, tensor) -> tensor + func.return %0 : tensor + } +} + +// CHECK-LABEL: func @matmul_with_while +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<1024x1024xi8> +// CHECK-DAG: %[[CNT:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor +// CHECK: %[[PRESERVE_W:.*]] = "tf.Identity"(%[[W]]) : (tensor<1024x1024xi8>) -> tensor<1024x1024xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[PRESERVE_W]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<1024x1024xi8>) -> tensor<1024x1024xf32> +// CHECK: %[[WHILE:.*]] = "tf.While"(%[[CNT]], %[[CNT]], %[[CNT]], %arg0, %[[DEQUANTIZED]]) <{body = @while_body, cond = @while_cond, is_stateless = true, parallel_iterations = 10 : i64, shape_invariant}> {T = [i32, i32, i32, f32, f32], _lower_using_switch_merge = true, _num_original_outputs = 5 : i64, _read_only_resource_inputs = [], device = "", output_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<1x1024>, #tf_type.shape<1024x1024>]} : (tensor, tensor, tensor, tensor<1x1024xf32>, tensor<1024x1024xf32>) -> (tensor, tensor, tensor, tensor<1x1024xf32>, tensor<1024x1024xf32>) +// CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[WHILE:.*]]) {device = ""} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> +// CHECK: return %[[IDENTITY]] : tensor<1x1024xf32> + +// CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xf32> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.00787401571> : tensor + +// CHECK-LABEL: func private @while_body(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xf32>, %arg4: tensor<1024x1024xf32>) -> (tensor, tensor, tensor, tensor<1x1024xf32>, tensor<1024x1024xf32>) +// CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg3, %arg4) <{transpose_a = false, transpose_b = false}> {device = ""} : (tensor<1x1024xf32>, tensor<1024x1024xf32>) -> tensor<1x1024xf32> +// CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%arg4) {device = ""} : (tensor<1024x1024xf32>) -> tensor<1024x1024xf32> +// CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg3, %[[IDENTITY]]) <{transpose_a = false, transpose_b = false}> {device = ""} : (tensor<1x1024xf32>, tensor<1024x1024xf32>) -> tensor<1x1024xf32> +// CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[MATMUL_1]], %[[MATMUL_2]]) {device = ""} : (tensor<1x1024xf32>, tensor<1x1024xf32>) -> tensor<1x1024xf32> + +// CHECK-LABEL: func private @while_cond(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xf32>, %arg4: tensor<1024x1024xf32>) -> tensor +// CHECK: return %0 : tensor + +// ----- + +module { + func.func @matmul_with_while_bf16(%arg0: tensor<1x1024xbf16>) -> tensor<1x1024xbf16> { + %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<1.0> : tensor<1024x1024xbf16>} : () -> tensor<1024x1024xbf16> + %0:5 = "tf.While"(%cst_0, %cst, %cst_0, %arg0, %cst_1) {T = [i32, i32, i32, bf16, bf16],_lower_using_switch_merge = true, _num_original_outputs = 5 : i64, _read_only_resource_inputs = [], body = @while_body, cond = @while_cond, device = "", is_stateless = true, output_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, 
#tf_type.shape<1x1024>, #tf_type.shape<1024x1024>], parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) -> (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) + %1 = "tf.Identity"(%0#3) {device = ""} : (tensor<1x1024xbf16>) -> tensor<1x1024xbf16> + func.return %1 : tensor<1x1024xbf16> + } + + func.func private @while_body(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xbf16>, %arg4: tensor<1024x1024xbf16>) -> (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) + { + %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.AddV2"(%arg2, %cst) {device = ""} : (tensor, tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + %2 = "tf.XlaDotV2"(%arg3, %arg4) {device = "", dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""} : (tensor<1x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<1x1024xbf16> + %3 = "tf.Identity"(%2) {device = ""} : (tensor<1x1024xbf16>) -> tensor<1x1024xbf16> + %4 = "tf.AddV2"(%arg0, %cst) {device = ""} : (tensor, tensor) -> tensor + %5 = "tf.Identity"(%arg4) {device = ""} : (tensor<1024x1024xbf16>) -> tensor<1024x1024xbf16> + %6 = "tf.XlaDotV2"(%arg3, %5) {device = "", dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""} : (tensor<1x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<1x1024xbf16> + %7 = "tf.AddV2"(%2, %6) {device = ""} : (tensor<1x1024xbf16>, tensor<1x1024xbf16>) -> tensor<1x1024xbf16> + %8 = "tf.Identity"(%4) {device = ""} : (tensor) -> tensor + %9 = "tf.Identity"(%arg1) {device = ""} : (tensor) -> tensor + func.return %8, %9, %1, %7, %arg4 : tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16> + } + + func.func private @while_cond(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xbf16>, %arg4: tensor<1024x1024xbf16>) -> tensor + { + %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.Less"(%arg0, %cst) : (tensor, tensor) -> tensor + func.return %0 : tensor + } +} + +// CHECK-LABEL: func @matmul_with_while_bf16 +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<127> : tensor<1024x1024xi8> +// CHECK-DAG: %[[CNT:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor +// CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[W]]) : (tensor<1024x1024xi8>) -> tensor<1024x1024xi8> +// CHECK: %[[DEQUANTIZED:.*]] = "tf.PartitionedCall"(%[[IDENTITY]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<1024x1024xi8>) -> tensor<1024x1024xbf16> +// CHECK: %[[WHILE:.*]] = "tf.While"(%[[CNT]], %[[CNT]], %[[CNT]], %arg0, %[[DEQUANTIZED]]) <{body = @while_body, cond = @while_cond, is_stateless = true, parallel_iterations = 10 : i64, shape_invariant}> {T = [i32, i32, i32, bf16, bf16], _lower_using_switch_merge = true, _num_original_outputs = 5 : i64, _read_only_resource_inputs = [], device = "", output_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<1x1024>, #tf_type.shape<1024x1024>]} : (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) -> (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) +// CHECK: %[[ORIGINAL_IDENTITY:.*]] = "tf.Identity"(%[[WHILE:.*]]) {device = ""} : (tensor<1x1024xbf16>) -> tensor<1x1024xbf16> + +// CHECK-LABEL: func.func private @composite_dequantize_uniform(%arg0: tensor<*xi8>) -> tensor<*xbf16> +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<7.873530e-03> : tensor + +// CHECK-LABEL: 
func private @while_body(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xbf16>, %arg4: tensor<1024x1024xbf16>) -> (tensor, tensor, tensor, tensor<1x1024xbf16>, tensor<1024x1024xbf16>) { +// CHECK: %[[MATMUL_1:.*]] = "tf.XlaDotV2"(%arg3, %arg4) <{dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""}> {device = ""} : (tensor<1x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<1x1024xbf16> +// CHECK: %[[IDENTITY_2:.*]] = "tf.Identity"(%arg4) {device = ""} : (tensor<1024x1024xbf16>) -> tensor<1024x1024xbf16> +// CHECK: %[[MATMUL_2:.*]] = "tf.XlaDotV2"(%arg3, %[[IDENTITY_2]]) <{dimension_numbers = "\12\01\00\0A\01\03", precision_config = ""}> {device = ""} : (tensor<1x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<1x1024xbf16> +// CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[MATMUL_1]], %[[MATMUL_2]]) {device = ""} : (tensor<1x1024xbf16>, tensor<1x1024xbf16>) -> tensor<1x1024xbf16> + +// CHECK-LABEL: func private @while_cond(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<1x1024xbf16>, %arg4: tensor<1024x1024xbf16>) -> tensor { +// CHECK: return %0 : tensor + +// ----- + +module { + func.func @matmul_with_while_returning_mutated_value(%arg0: tensor, %arg2: tensor<*xf32>) -> (tensor<*xf32>) { + // The constant should not be quantized. + %cst = "tf.Const" () {value = dense<1.0> : tensor<1024x1024xf32>} : () -> tensor<1024x1024xf32> + %0:3 = "tf.While"(%arg0, %cst, %arg2) { + cond = @cond, body = @body, is_stateless = false + } : (tensor, tensor<1024x1024xf32>, tensor<*xf32>) -> (tensor, tensor<*xf32>, tensor<*xf32>) + func.return %0#1 : tensor<*xf32> + } + + func.func private @cond(%arg0: tensor, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> tensor { + %0 = "tf.Const" () {value = dense<0> : tensor} : () -> tensor + %1 = "tf.greater"(%arg0, %0) : (tensor, tensor) -> tensor + func.return %1 : tensor + } + + func.func private @body(%arg0: tensor, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> (tensor, tensor<*xf32>, tensor<*xf32>) { + %0 = "tf.Const" () {value = dense<1> : tensor} : () -> tensor + %1 = "tf.Sub"(%arg0, %0) : (tensor, tensor) -> tensor + %2 = "tf.MatMul"(%arg2, %arg1) {} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %3 = "tf.AddV2" (%arg1, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %4 = "tf.Identity"(%1) {device = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%3) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + %6 = "tf.Identity"(%2) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + func.return %4, %5, %6 : tensor, tensor<*xf32>, tensor<*xf32> + } +} + +// CHECK-LABEL: func @matmul_with_while_returning_mutated_value +// CHECK-DAG: %[[W:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<1024x1024xf32>}> : () -> tensor<1024x1024xf32> + +// ----- +module { + func.func @multiple_quantizable_ops_in_graph(%arg0: tensor<1xi32>) -> tensor<1x3x1x1xf32> { + %cst = "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<1.1> : tensor<2x3x3x1024xf32>} : () -> tensor<2x3x3x1024xf32> + %cst_1 = "tf.Const"() {value = dense<1.1> : tensor<3x3x1024x1xf32>} : () -> tensor<3x3x1024x1xf32> + %cst_2 = "tf.Const"() {value = dense<1.1> : tensor<1024x3x4x3xf32>} : () -> tensor<1024x3x4x3xf32> + %0 = "tf.GatherV2"(%cst_2, %arg0, %cst) {batch_dims = 0 : i64, device = ""} : (tensor<1024x3x4x3xf32>, tensor<1xi32>, tensor) -> tensor<1x3x4x3xf32> + %1 = "tf.Conv2D"(%0, %cst_0) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], 
use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x1024xf32>) -> tensor<1x3x2x1024xf32> + %2 = "tf.Conv2D"(%1, %cst_1) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x2x1024xf32>, tensor<3x3x1024x1xf32>) -> tensor<1x3x1x1xf32> + %3 = "tf.Identity"(%2) {device = ""} : (tensor<1x3x1x1xf32>) -> tensor<1x3x1x1xf32> + return %3 : tensor<1x3x1x1xf32> + } + +// CHECK-LABEL: func @multiple_quantizable_ops_in_graph +// CHECK-DAG: %[[W_1:.*]] = "tf.Const"() <{value = dense<127> : tensor<2x3x3x1024xi8>}> : () -> tensor<2x3x3x1024xi8> +// CHECK-DAG: %[[W_2:.*]] = "tf.Const"() <{value = dense<127> : tensor<3x3x1024x1xi8>}> : () -> tensor<3x3x1024x1xi8> +// CHECK-DAG: %[[W_3:.*]] = "tf.Const"() <{value = dense<127> : tensor<1024x3x4x3xi8>}> : () -> tensor<1024x3x4x3xi8> +// CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() <{value = dense<0> : tensor}> {device = ""} : () -> tensor +// CHECK: %[[IDENTITY_1:.*]] = "tf.Identity"(%[[W_1]]) : (tensor<2x3x3x1024xi8>) -> tensor<2x3x3x1024xi8> +// CHECK: %[[DEQUANTIZED_1:.*]] = "tf.PartitionedCall"(%[[IDENTITY_1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform__}> : (tensor<2x3x3x1024xi8>) -> tensor<2x3x3x1024xf32> +// CHECK: %[[IDENTITY_2:.*]] = "tf.Identity"(%[[W_2]]) : (tensor<3x3x1024x1xi8>) -> tensor<3x3x1024x1xi8> +// CHECK: %[[DEQUANTIZED_2:.*]] = "tf.PartitionedCall"(%[[IDENTITY_2]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform_}> : (tensor<3x3x1024x1xi8>) -> tensor<3x3x1024x1xf32> +// CHECK: %[[IDENTITY_3:.*]] = "tf.Identity"(%[[W_3]]) : (tensor<1024x3x4x3xi8>) -> tensor<1024x3x4x3xi8> +// CHECK: %[[DEQUANTIZED_3:.*]] = "tf.PartitionedCall"(%[[IDENTITY_3]]) <{config = "", config_proto = "", executor_type = "", f = @composite_dequantize_uniform}> : (tensor<1024x3x4x3xi8>) -> tensor<1024x3x4x3xf32> +// CHECK: %[[GATHER:.*]] = "tf.GatherV2"(%[[DEQUANTIZED_3]], %arg0, %[[AXIS]]) <{batch_dims = 0 : i64}> {device = ""} : (tensor<1024x3x4x3xf32>, tensor<1xi32>, tensor) -> tensor<1x3x4x3xf32> +// CHECK: %[[CONV_1:.*]] = "tf.Conv2D"(%[[GATHER]], %[[DEQUANTIZED_1]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> {device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x1024xf32>) -> tensor<1x3x2x1024xf32> +// CHECK: %[[CONV_2:.*]] = "tf.Conv2D"(%[[CONV_1]], %[[DEQUANTIZED_2]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true}> {device = ""} : (tensor<1x3x2x1024xf32>, tensor<3x3x1024x1xf32>) -> tensor<1x3x1x1xf32> + +// CHECK-LABEL: func private @composite_dequantize_uniform__ +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.00866141729> : tensor}> : () -> tensor + +// CHECK-LABEL: func private @composite_dequantize_uniform_ +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.00866141729> : tensor}> : () -> tensor + +// CHECK-LABEL: func private @composite_dequantize_uniform +// CHECK-DAG: %[[SCALE:.*]] = "tf.Const"() <{value = dense<0.00866141729> : tensor}> : () -> tensor + +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_remove_var_init_by_const.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_remove_var_init_by_const.mlir new file mode 100644 index 000000000000..aa730aade7be --- /dev/null +++ 
b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_remove_var_init_by_const.mlir @@ -0,0 +1,150 @@ +// RUN: tf-quant-opt %s -split-input-file -verify-diagnostics \ +// RUN: -tf-quant-remove-var-init-by-const | FileCheck %s + +// Single `tf.AssignVariableOp(tf.VarHandleOp, tf.Const)` pattern removed from +// the initializer function. + +// CHECK-LABEL: module +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op]} : () -> () + // CHECK: "tf_saved_model.session_initializer"() + // CHECK-SAME: initializers = [@init_func_restore_op] + + func.func @init_func_restore_op() -> () attributes { + tf_saved_model.initializer_type = "restore_op", + tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"]} { + %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %var_0 = "tf.VarHandleOp"() {shared_name = "var_0"} : () -> tensor>> + "tf.AssignVariableOp"(%var_0, %cst_0) : (tensor>>, tensor<2xf32>) -> () + return + } + // All three ops should have been removed. + // CHECK: @init_func_restore_op + // CHECK-SAME: tf_saved_model.initializer_type = "restore_op" + // CHECK-NEXT: return +} + +// ----- + +// The `tf.AssignVariableOp(tf.VarHandleOp, tf.Const)` pattern is not removed +// from the initializer function that is not "restore_op" type. + +// CHECK-LABEL: module +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_init_op]} : () -> () + // CHECK: "tf_saved_model.session_initializer"() + // CHECK-SAME: initializers = [@init_func_init_op] + + func.func @init_func_init_op() -> () attributes { + tf_saved_model.initializer_type = "init_op", + tf_saved_model.exported_names = ["tf_saved_model.session_initializer_init_op"]} { + %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %var_0 = "tf.VarHandleOp"() {shared_name = "var_0"} : () -> tensor>> + "tf.AssignVariableOp"(%var_0, %cst_0) : (tensor>>, tensor<2xf32>) -> () + return + } + // Nothing has been removed. + // CHECK: @init_func_init_op + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.VarHandleOp" + // CHECK-NEXT: "tf.AssignVariableOp" + // CHECK-NEXT: return +} + +// ----- + +// If `tf.Const` is not used to initialize the variable, it is not removed. + +// CHECK-LABEL: module +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op]} : () -> () + // CHECK: "tf_saved_model.session_initializer"() + // CHECK-SAME: initializers = [@init_func_restore_op] + + func.func @init_func_restore_op() -> () attributes { + tf_saved_model.initializer_type = "restore_op", + tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"]} { + %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %var_0 = "tf.VarHandleOp"() {shared_name = "var_0"} : () -> tensor>> + "tf.AssignVariableOp"(%var_0, %cst_0) : (tensor>>, tensor<2xf32>) -> () + %add_0 = "tf.Identity"(%cst_0) : (tensor<2xf32>) -> tensor<2xf32> + %var_1 = "tf.VarHandleOp"() {shared_name = "var_1"} : () -> tensor>> + "tf.AssignVariableOp"(%var_1, %add_0) : (tensor>>, tensor<2xf32>) -> () + return + } + // The second AssignVariableOp, which takes the result of the `tf.Identity` + // op, is not removed. Note that the first AssignVariableOp is removed. 
+ // CHECK: @init_func_restore_op + // CHECK-SAME: tf_saved_model.initializer_type = "restore_op" + // CHECK-NOT: "tf.AssignVariableOp" + // CHECK: %[[CST:.*]] = "tf.Const"() + // CHECK-NEXT: %[[IDENTITY:.*]] = "tf.Identity"(%[[CST]]) + // CHECK-NEXT: %[[VAR:.*]] = "tf.VarHandleOp"() <{{{.*shared_name = "var_1".*}}}> + // CHECK-NEXT: "tf.AssignVariableOp"(%[[VAR]], %[[IDENTITY]]) +} + +// ----- + +// If something other than `tf.VarHandleOp` is being initialized, it is +// not erased. + +// CHECK-LABEL: module +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op]} : () -> () + // CHECK: "tf_saved_model.session_initializer"() + // CHECK-SAME: initializers = [@init_func_restore_op] + + func.func @init_func_restore_op() -> () attributes { + tf_saved_model.initializer_type = "restore_op", + tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"]} { + %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + // Note: this is a contrived example and is an invalid input. + %var_0 = "tf.HashTableV2"() {key_dtype = i64, value_dtype = !tf_type.string} : () -> tensor + "tf.AssignVariableOp"(%var_0, %cst_0) : (tensor, tensor<2xf32>) -> () + return + } + // CHECK: @init_func_restore_op + // CHECK-SAME: tf_saved_model.initializer_type = "restore_op" + // CHECK: %[[CST:.*]] = "tf.Const"() + // CHECK-NEXT: %[[HASH_TABLE:.*]] = "tf.HashTableV2"() + // CHECK-NEXT: "tf.AssignVariableOp"(%[[HASH_TABLE]], %[[CST]]) +} + +// ----- + + +// Nothing happens when there are no `tf_saved_model.session_initializer`. + +// CHECK-LABEL: module +module attributes {tf_saved_model.semantics} { +} + +// ----- + +// Nothing happens when there are no initializer functions. + +// CHECK-LABEL: module +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = []} : () -> () +} + +// ----- + +// Nothing happens when the initializer function of type = "restore_op" is +// empty. 
+ +// CHECK-LABEL: module +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op]} : () -> () + // CHECK: "tf_saved_model.session_initializer"() + // CHECK-SAME: initializers = [@init_func_restore_op] + + func.func @init_func_restore_op() -> () attributes { + tf_saved_model.initializer_type = "restore_op", + tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"]} { + return + } + // CHECK: @init_func_restore_op + // CHECK-SAME: tf_saved_model.initializer_type = "restore_op" + // CHECK-NEXT: return +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_replace_cast_hacks_with_tf_xla_ops.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_replace_cast_hacks_with_tf_xla_ops.mlir new file mode 100644 index 000000000000..0bad0b32af0a --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_replace_cast_hacks_with_tf_xla_ops.mlir @@ -0,0 +1,1000 @@ +// RUN: tf-quant-opt %s -split-input-file -inline -tf-quant-replace-cast-hacks-with-tf-xla-ops | FileCheck %s + +// ----- + +module attributes {} { + func.func @conv_with_bias_and_relu(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x2x2xf32> { + %cst = "tf.Const"() {value = dense<[162, 160]> : tensor<2xi32>} : () -> tensor<2xi32> + %cst_0 = "tf.Const"() {value = dense<[[[[-85, 72], [23, -103], [-29, -96]], [[-128, -83], [81, -57], [67, 119]], [[44, 10], [-90, -107], [77, 122]]], [[[18, 61], [127, -20], [-107, 119]], [[12, -66], [-98, 15], [124, 9]], [[68, 119], [20, -52], [48, 123]]]]> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2xi8> + %cst_1 = "tf.Const"() {value = dense<0.587548196> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<-128> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {value = dense<18.1044273> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {value = dense<0.0748551115> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %cst_6 = "tf.Const"() {value = dense<0.0439809859> : tensor} : () -> tensor + %cst_7 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %0 = "tf.PartitionedCall"(%arg0, %cst_1, %cst_2) {config = "", config_proto = "", executor_type = "", f = @quantize_i8} : (tensor<1x3x4x3xf32>, tensor, tensor) -> tensor<1x3x4x3xi8> + %1 = "tf.PartitionedCall"(%0, %cst_0, %cst, %cst_1, %cst_2, %cst_4, %cst_5, %cst_6, %cst_7, %cst_3, %cst_2) {config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_bias_and_relu_fn_0} : (tensor<1x3x4x3xi8>, tensor<2x3x3x2xi8>, tensor<2xi32>, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor<1x3x2x2xi8> + %2 = "tf.PartitionedCall"(%1, %cst_3, %cst_2) {config = "", config_proto = "", executor_type = "", f = @dequantize_i8} : (tensor<1x3x2x2xi8>, tensor, tensor) -> tensor<1x3x2x2xf32> + return %2 : tensor<1x3x2x2xf32> + } + func.func private @quantize_i8(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor, %arg2: tensor) -> tensor<1x3x4x3xi8> { + %0 = "tf.Div"(%arg0, %arg1) : (tensor<1x3x4x3xf32>, tensor) -> tensor<1x3x4x3xf32> + %1 = "tf.Round"(%0) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> + %2 = "tf.Cast"(%1) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xi32> + %3 = "tf.AddV2"(%2, %arg2) : (tensor<1x3x4x3xi32>, tensor) -> tensor<1x3x4x3xi32> + %4 = "tf.Cast"(%3) {Truncate = false} : (tensor<1x3x4x3xi32>) -> tensor<1x3x4x3xi8> + return %4 : tensor<1x3x4x3xi8> + } + func.func private @dequantize_i8(%arg0: tensor<1x3x2x2xi8>, %arg1: tensor, %arg2: tensor) 
-> tensor<1x3x2x2xf32> { + %0 = "tf.Cast"(%arg0) : (tensor<1x3x2x2xi8>) -> tensor<1x3x2x2xi32> + %1 = "tf.Sub"(%0, %arg2) : (tensor<1x3x2x2xi32>, tensor) -> tensor<1x3x2x2xi32> + %2 = "tf.Cast"(%1) : (tensor<1x3x2x2xi32>) -> tensor<1x3x2x2xf32> + %3 = "tf.Mul"(%2, %arg1) : (tensor<1x3x2x2xf32>, tensor) -> tensor<1x3x2x2xf32> + return %3 : tensor<1x3x2x2xf32> + } + func.func private @quantized_conv2d_with_bias_and_relu_fn_0(%arg0: tensor<1x3x4x3xi8>, %arg1: tensor<2x3x3x2xi8>, %arg2: tensor<2xi32>, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor) -> tensor<1x3x2x2xi8> { + %cst = "tf.Const"() {value = dense<127> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<-128> : tensor} : () -> tensor + %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x3x4x3xi8>) -> tensor<1x3x4x3xi32> + %1 = "tf.Sub"(%0, %arg4) : (tensor<1x3x4x3xi32>, tensor) -> tensor<1x3x4x3xi32> + %identity = "tf.Identity"(%arg1) : (tensor<2x3x3x2xi8>) -> tensor<2x3x3x2xi8> + %2 = "tf.Cast"(%identity) {Truncate = false} : (tensor<2x3x3x2xi8>) -> tensor<2x3x3x2xi32> + %3 = "tf.Sub"(%2, %arg6) : (tensor<2x3x3x2xi32>, tensor) -> tensor<2x3x3x2xi32> + %4 = "tf.Conv2D"(%1, %3) {dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xi32>, tensor<2x3x3x2xi32>) -> tensor<1x3x2x2xi32> + %5 = "tf.AddV2"(%4, %arg2) : (tensor<1x3x2x2xi32>, tensor<2xi32>) -> tensor<1x3x2x2xi32> + %6 = "tf.Mul"(%arg3, %arg5) : (tensor, tensor) -> tensor + %7 = "tf.Div"(%6, %arg9) : (tensor, tensor) -> tensor + %8 = "tf.Cast"(%5) {Truncate = false} : (tensor<1x3x2x2xi32>) -> tensor<1x3x2x2xf32> + %9 = "tf.Mul"(%7, %8) : (tensor, tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + %10 = "tf.Round"(%9) : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + %11 = "tf.Cast"(%10) {Truncate = false} : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xi32> + %12 = "tf.AddV2"(%11, %arg10) : (tensor<1x3x2x2xi32>, tensor) -> tensor<1x3x2x2xi32> + %13 = "tf.Maximum"(%cst_0, %arg10) : (tensor, tensor) -> tensor + %14 = "tf.ClipByValue"(%12, %13, %cst) : (tensor<1x3x2x2xi32>, tensor, tensor) -> tensor<1x3x2x2xi32> + %15 = "tf.Cast"(%14) {Truncate = false} : (tensor<1x3x2x2xi32>) -> tensor<1x3x2x2xi8> + return %15 : tensor<1x3x2x2xi8> + } + +// CHECK-LABEL: func @conv_with_bias_and_relu +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<[1, 2]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<1> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_3:.*]] = "tf.Const"() <{value = dense<0> : tensor<2x2xi32>}> : () -> tensor<2x2xi32> +// CHECK-DAG: %[[CONST_4:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<4x2xi32>}> : () -> tensor<4x2xi32> +// CHECK-DAG-SAME{LITERAL}: value = dense<[[0, 0], [0, 1], [0, 1], [0, 0]]> +// CHECK-DAG: %[[CONST_5:.*]] = "tf.Const"() <{value = dense<-128> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_6:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2xi8> +// CHECK-DAG: %[[CONST_7:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2xi32> +// CHECK-DAG-SAME{LITERAL}: value = dense<[[[[-22016, -23680]]]]> +// CHECK-DAG: %[[CONST_8:.*]] = "tf.Const"() <{value = dense<[162, 160]> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK: %[[PADV2_0:.*]] = 
"tf.PadV2"({{.*}}, %[[CONST_4]], %[[CONST_5]]) : (tensor<1x3x4x3xi8>, tensor<4x2xi32>, tensor) -> tensor<1x4x5x3xi8> +// CHECK: %[[XLACONVV2_0:.*]] = "tf.XlaConvV2"(%[[PADV2_0]], %[[CONST_6]], %[[CONST_0]], %[[CONST_3]], %[[CONST_1]], %[[CONST_1]], %[[CONST_2]]) +// CHECK-SAME: (tensor<1x4x5x3xi8>, tensor<2x3x3x2xi8>, tensor<2xi32>, tensor<2x2xi32>, tensor<2xi32>, tensor<2xi32>, tensor) -> tensor<1x3x2x2xi32> +// CHECK: %[[SUB_0:.*]] = "tf.Sub"(%[[XLACONVV2_0]], %[[CONST_7]]) : (tensor<1x3x2x2xi32>, tensor<1x1x1x2xi32>) -> tensor<1x3x2x2xi32> +// CHECK: %[[ADDV2_1:.*]] = "tf.AddV2"(%[[SUB_0]], %[[CONST_8]]) : (tensor<1x3x2x2xi32>, tensor<2xi32>) -> tensor<1x3x2x2xi32> +} + +// ----- + +module attributes {} { + func.func @depthwise_conv_with_bias_and_relu6(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x2x2x3xf32> { + %cst = "tf.Const"() {value = dense<[129, 166, 221]> : tensor<3xi32>} : () -> tensor<3xi32> + %cst_0 = "tf.Const"() {value = dense<[[[[-84], [73], [24]], [[-102], [-28], [-94]], [[-127], [-82], [82]]], [[[-56], [67], [120]], [[45], [11], [-88]], [[-106], [77], [123]]]]> : tensor<2x3x3x1xi8>} : () -> tensor<2x3x3x1xi8> + %cst_1 = "tf.Const"() {value = dense<0.587548196> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<-128> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {value = dense<0.0235294122> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {value = dense<0.0751230493> : tensor<1xf32>} : () -> tensor<1xf32> + %cst_5 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %cst_6 = "tf.Const"() {value = dense<0.0441384129> : tensor} : () -> tensor + %cst_7 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %0 = "tf.PartitionedCall"(%arg0, %cst_1, %cst_2) {config = "", config_proto = "", executor_type = "", f = @quantize_i8} : (tensor<1x3x4x3xf32>, tensor, tensor) -> tensor<1x3x4x3xi8> + %1 = "tf.PartitionedCall"(%0, %cst_0, %cst, %cst_1, %cst_2, %cst_4, %cst_5, %cst_6, %cst_7, %cst_3, %cst_2) {config = "", config_proto = "", executor_type = "", f = @quantized_depthwise_conv2d_with_bias_and_relu6_fn_0} : (tensor<1x3x4x3xi8>, tensor<2x3x3x1xi8>, tensor<3xi32>, tensor, tensor, tensor<1xf32>, tensor<1xi32>, tensor, tensor, tensor, tensor) -> tensor<1x2x2x3xi8> + %2 = "tf.PartitionedCall"(%1, %cst_3, %cst_2) {config = "", config_proto = "", executor_type = "", f = @dequantize_i8} : (tensor<1x2x2x3xi8>, tensor, tensor) -> tensor<1x2x2x3xf32> + return %2 : tensor<1x2x2x3xf32> + } + func.func private @quantize_i8(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor, %arg2: tensor) -> tensor<1x3x4x3xi8> { + %0 = "tf.Div"(%arg0, %arg1) : (tensor<1x3x4x3xf32>, tensor) -> tensor<1x3x4x3xf32> + %1 = "tf.Round"(%0) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> + %2 = "tf.Cast"(%1) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xi32> + %3 = "tf.AddV2"(%2, %arg2) : (tensor<1x3x4x3xi32>, tensor) -> tensor<1x3x4x3xi32> + %4 = "tf.Cast"(%3) {Truncate = false} : (tensor<1x3x4x3xi32>) -> tensor<1x3x4x3xi8> + return %4 : tensor<1x3x4x3xi8> + } + func.func private @dequantize_i8(%arg0: tensor<1x2x2x3xi8>, %arg1: tensor, %arg2: tensor) -> tensor<1x2x2x3xf32> { + %0 = "tf.Cast"(%arg0) : (tensor<1x2x2x3xi8>) -> tensor<1x2x2x3xi32> + %1 = "tf.Sub"(%0, %arg2) : (tensor<1x2x2x3xi32>, tensor) -> tensor<1x2x2x3xi32> + %2 = "tf.Cast"(%1) : (tensor<1x2x2x3xi32>) -> tensor<1x2x2x3xf32> + %3 = "tf.Mul"(%2, %arg1) : (tensor<1x2x2x3xf32>, tensor) -> tensor<1x2x2x3xf32> + return %3 : tensor<1x2x2x3xf32> + } + func.func private @quantized_depthwise_conv2d_with_bias_and_relu6_fn_0(%arg0: 
tensor<1x3x4x3xi8>, %arg1: tensor<2x3x3x1xi8>, %arg2: tensor<3xi32>, %arg3: tensor, %arg4: tensor, %arg5: tensor<1xf32>, %arg6: tensor<1xi32>, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor) -> tensor<1x2x2x3xi8> { + %cst = "tf.Const"() {value = dense<127> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<-128> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<6.000000e+00> : tensor} : () -> tensor + %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x3x4x3xi8>) -> tensor<1x3x4x3xi32> + %1 = "tf.Sub"(%0, %arg4) : (tensor<1x3x4x3xi32>, tensor) -> tensor<1x3x4x3xi32> + %identity = "tf.Identity"(%arg1) : (tensor<2x3x3x1xi8>) -> tensor<2x3x3x1xi8> + %2 = "tf.Cast"(%identity) {Truncate = false} : (tensor<2x3x3x1xi8>) -> tensor<2x3x3x1xi32> + %3 = "tf.Sub"(%2, %arg6) : (tensor<2x3x3x1xi32>, tensor<1xi32>) -> tensor<2x3x3x1xi32> + %5 = "tf.Cast"(%1) {Truncate = false} : (tensor<1x3x4x3xi32>) -> tensor<1x3x4x3xf32> + %6 = "tf.Cast"(%3) {Truncate = false} : (tensor<2x3x3x1xi32>) -> tensor<2x3x3x1xf32> + %7 = "tf.DepthwiseConv2dNative"(%5, %6) {dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1]} : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xf32>) -> tensor<1x2x2x3xf32> + %8 = "tf.Cast"(%7) : (tensor<1x2x2x3xf32>) -> tensor<1x2x2x3xi32> + %9 = "tf.AddV2"(%8, %arg2) : (tensor<1x2x2x3xi32>, tensor<3xi32>) -> tensor<1x2x2x3xi32> + %10 = "tf.Mul"(%arg3, %arg5) : (tensor, tensor<1xf32>) -> tensor<1xf32> + %11 = "tf.Div"(%10, %arg9) : (tensor<1xf32>, tensor) -> tensor<1xf32> + %12 = "tf.Cast"(%9) {Truncate = false} : (tensor<1x2x2x3xi32>) -> tensor<1x2x2x3xf32> + %13 = "tf.Mul"(%11, %12) : (tensor<1xf32>, tensor<1x2x2x3xf32>) -> tensor<1x2x2x3xf32> + %14 = "tf.Round"(%13) : (tensor<1x2x2x3xf32>) -> tensor<1x2x2x3xf32> + %15 = "tf.Cast"(%14) {Truncate = false} : (tensor<1x2x2x3xf32>) -> tensor<1x2x2x3xi32> + %16 = "tf.AddV2"(%15, %arg10) : (tensor<1x2x2x3xi32>, tensor) -> tensor<1x2x2x3xi32> + %17 = "tf.Div"(%cst_1, %arg9) : (tensor, tensor) -> tensor + %18 = "tf.Round"(%17) : (tensor) -> tensor + %19 = "tf.Cast"(%18) : (tensor) -> tensor + %20 = "tf.AddV2"(%19, %arg10) : (tensor, tensor) -> tensor + %21 = "tf.Cast"(%20) : (tensor) -> tensor + %22 = "tf.Cast"(%21) {Truncate = false} : (tensor) -> tensor + %23 = "tf.Cast"(%22) {Truncate = false} : (tensor) -> tensor + %24 = "tf.Maximum"(%cst_0, %arg10) : (tensor, tensor) -> tensor + %25 = "tf.Minimum"(%cst, %23) : (tensor, tensor) -> tensor + %26 = "tf.ClipByValue"(%16, %24, %25) : (tensor<1x2x2x3xi32>, tensor, tensor) -> tensor<1x2x2x3xi32> + %27 = "tf.Cast"(%26) {Truncate = false} : (tensor<1x2x2x3xi32>) -> tensor<1x2x2x3xi8> + return %27 : tensor<1x2x2x3xi8> + } + +// CHECK-LABEL: func @depthwise_conv_with_bias_and_relu6 +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<4x2xi32>}> : () -> tensor<4x2xi32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<-128> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<{{.*}}> : tensor<2x3x1x3xi8>}> : () -> tensor<2x3x1x3xi8> +// CHECK-DAG: %[[CONST_3:.*]] = "tf.Const"() <{value = dense<2> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CONST_4:.*]] = "tf.Const"() <{value = dense<0> : tensor<2x2xi32>}> : () -> tensor<2x2xi32> +// CHECK-DAG: %[[CONST_5:.*]] = "tf.Const"() <{value = dense<1> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[CONST_6:.*]] = "tf.Const"() <{value = dense<3> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_7:.*]] = 
"tf.Const"() <{value = dense<{{.*}}> : tensor<1x1x1x3xi32>}> : () -> tensor<1x1x1x3xi32> +// CHECK-DAG-SAME{LITERAL}: value = dense<[[[[55040, -15104, -21376]]]]> +// CHECK-DAG: %[[CONST_8:.*]] = "tf.Const"() <{value = dense<[129, 166, 221]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK: %[[PADV2_0:.*]] = "tf.PadV2"({{.*}}, %[[CONST_0]], %[[CONST_1]]) : (tensor<1x3x4x3xi8>, tensor<4x2xi32>, tensor) -> tensor<1x4x5x3xi8> +// CHECK: %[[XLACONVV2_0:.*]] = "tf.XlaConvV2"(%[[PADV2_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], %[[CONST_5]], %[[CONST_5]], %[[CONST_6]]) +// CHECK-SAME: (tensor<1x4x5x3xi8>, tensor<2x3x1x3xi8>, tensor<2xi32>, tensor<2x2xi32>, tensor<2xi32>, tensor<2xi32>, tensor) -> tensor<1x2x2x3xi32> +// CHECK: %[[SUB_0:.*]] = "tf.Sub"(%[[XLACONVV2_0]], %[[CONST_7]]) : (tensor<1x2x2x3xi32>, tensor<1x1x1x3xi32>) -> tensor<1x2x2x3xi32> +// CHECK: %[[ADDV2_1:.*]] = "tf.AddV2"(%[[SUB_0]], %[[CONST_8]]) : (tensor<1x2x2x3xi32>, tensor<3xi32>) -> tensor<1x2x2x3xi32> +} + +// ----- + +module attributes {} { + func.func @dynamic_shaped_conv2d_with_bias_and_relu6_inlined(%arg0: tensor) -> tensor { + %cst = "tf.Const"() {device = "", value = dense<127> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "", value = dense<-128> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {device = "", value = dense<[1.8772192, 1.82187414]> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_2 = "tf.Const"() {device = "", value = dense<2> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2xi8> + %cst_3 = "tf.Const"() {device = "", value = dense<[161, 165]> : tensor<2xi32>} : () -> tensor<2xi32> + %cst_4 = "tf.Const"() {device = "", value = dense<0.587548196> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {device = "", value = dense<0.0235294122> : tensor} : () -> tensor + %0 = "tf.Div"(%arg0, %cst_4) {device = ""} : (tensor, tensor) -> tensor + %1 = "tf.Round"(%0) {device = ""} : (tensor) -> tensor + %2 = "tf.Cast"(%1) {device = ""} : (tensor) -> tensor + %3 = "tf.AddV2"(%2, %cst_0) {device = ""} : (tensor, tensor) -> tensor + %4 = "tf.Cast"(%3) {Truncate = false, device = ""} : (tensor) -> tensor + %5 = "tf.Cast"(%4) {Truncate = false, device = ""} : (tensor) -> tensor + %6 = "tf.Sub"(%5, %cst_0) {device = ""} : (tensor, tensor) -> tensor + %identity = "tf.Identity"(%cst_2) : (tensor<2x3x3x2xi8>) -> tensor<2x3x3x2xi8> + %cast_filter = "tf.Cast"(%identity) {Truncate = false} : (tensor<2x3x3x2xi8>) -> tensor<2x3x3x2xi32> + %7 = "tf.Conv2D"(%6, %cast_filter) {device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor, tensor<2x3x3x2xi32>) -> tensor + %8 = "tf.AddV2"(%7, %cst_3) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %9 = "tf.Cast"(%8) {Truncate = false, device = ""} : (tensor) -> tensor + %10 = "tf.Mul"(%9, %cst_1) {device = ""} : (tensor, tensor<2xf32>) -> tensor + %11 = "tf.Round"(%10) {device = ""} : (tensor) -> tensor + %12 = "tf.Cast"(%11) {Truncate = false, device = ""} : (tensor) -> tensor + %13 = "tf.AddV2"(%12, %cst_0) {device = ""} : (tensor, tensor) -> tensor + %14 = "tf.ClipByValue"(%13, %cst_0, %cst) {device = ""} : (tensor, tensor, tensor) -> tensor + %15 = "tf.Cast"(%14) {Truncate = false, device = ""} : (tensor) -> tensor + %16 = "tf.Cast"(%15) {device = ""} : (tensor) -> tensor + %17 = "tf.Sub"(%16, %cst_0) {device = ""} : (tensor, tensor) -> tensor + %18 = "tf.Cast"(%17) {device = ""} : (tensor) -> tensor + %19 = "tf.Mul"(%18, %cst_5) {device = ""} : (tensor, tensor) -> tensor + return %19 : 
tensor + } + +// CHECK-LABEL: func @dynamic_shaped_conv2d_with_bias_and_relu6_inlined +// CHECK-DAG: %[[filter:.*]] = "tf.Const"() <{value = dense<2> : tensor<2x3x3x2xi8>}> {device = ""} : () -> tensor<2x3x3x2xi8> +// CHECK-DAG: %[[input_shape:.*]] = "tf.Shape"({{.*}}) : (tensor) -> tensor<4xi32> +// CHECK-DAG: %[[input_dim_1:.*]] = "tf.StridedSlice"(%[[input_shape]], {{.*}}, {{.*}}, {{.*}}) <{begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK-DAG: %[[input_dim_2:.*]] = "tf.StridedSlice"(%[[input_shape]], {{.*}}, {{.*}}, {{.*}}) <{begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK-DAG: %[[padding_rank_1:.*]] = "tf.Concat"({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}) : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<8xi32> +// CHECK-DAG: %[[padding_rank_2:.*]] = "tf.Reshape"(%[[padding_rank_1]], {{.*}}) : (tensor<8xi32>, tensor<2xi64>) -> tensor<4x2xi32> +// CHECK-DAG: %[[input_padded:.*]] = "tf.PadV2"(%{{.*}}, %[[padding_rank_2]], {{.*}}) : (tensor, tensor<4x2xi32>, tensor) -> tensor +// CHECK: %[[conv_output:.*]] = "tf.XlaConvV2"(%[[input_padded]], %[[filter]], {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}) <{dimension_numbers = "{{.*}}", precision_config = ""}> : (tensor, tensor<2x3x3x2xi8>, tensor<2xi32>, tensor<2x2xi32>, tensor<2xi32>, tensor<2xi32>, tensor) -> tensor +// CHECK: %[[conv_output_sub:.*]] = "tf.Sub"(%[[conv_output]], {{.*}}) : (tensor, tensor<1x1x1x2xi32>) -> tensor +// CHECK: %[[conv_output_add:.*]] = "tf.AddV2"(%[[conv_output_sub]], {{.*}}) {device = ""} : (tensor, tensor<2xi32>) -> tensor +} + +// ----- + +module attributes {tf_saved_model.semantics} { + func.func @conv_with_filter_larger_than_1MB(%arg0: tensor<1x224x224x3xf32> {tf_saved_model.index_path = ["input"]}) -> (tensor<1x224x112x512xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() {value = dense<2> : tensor<32x32x3x512xi8>} : () -> tensor<32x32x3x512xi8> + %cst_0 = "tf.Const"() {value = dense<0.00117647066> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<-43> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<0.0027450982> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {value = dense<-19> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {value = dense<0.01> : tensor<512xf32>} : () -> tensor<512xf32> + %cst_5 = "tf.Const"() {value = dense<0> : tensor<512xi32>} : () -> tensor<512xi32> + %0 = "tf.PartitionedCall"(%arg0, %cst_0, %cst_1) {config = "", config_proto = "", executor_type = "", f = @quantize_i8} : (tensor<1x224x224x3xf32>, tensor, tensor) -> tensor<1x224x224x3xi8> + %1 = "tf.PartitionedCall"(%0, %cst, %cst_0, %cst_1, %cst_4, %cst_5, %cst_2, %cst_3) {config = "", config_proto = "", executor_type = "", f = @quantized_conv2d_with_relu_fn_0} : (tensor<1x224x224x3xi8>, tensor<32x32x3x512xi8>, tensor, tensor, tensor<512xf32>, tensor<512xi32>, tensor, tensor) -> tensor<1x224x112x512xi8> + %2 = "tf.PartitionedCall"(%1, %cst_2, %cst_3) {config = "", config_proto = "", 
executor_type = "", f = @dequantize_i8} : (tensor<1x224x112x512xi8>, tensor, tensor) -> tensor<1x224x112x512xf32> + return %2 : tensor<1x224x112x512xf32> + } + func.func private @quantize_i8(%arg0: tensor<1x224x224x3xf32>, %arg1: tensor, %arg2: tensor) -> tensor<1x224x224x3xi8> { + %0 = "tf.Div"(%arg0, %arg1) : (tensor<1x224x224x3xf32>, tensor) -> tensor<1x224x224x3xf32> + %1 = "tf.Round"(%0) : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3xf32> + %2 = "tf.Cast"(%1) : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3xi32> + %3 = "tf.AddV2"(%2, %arg2) : (tensor<1x224x224x3xi32>, tensor) -> tensor<1x224x224x3xi32> + %4 = "tf.Cast"(%3) {Truncate = false} : (tensor<1x224x224x3xi32>) -> tensor<1x224x224x3xi8> + return %4 : tensor<1x224x224x3xi8> + } + func.func private @dequantize_i8(%arg0: tensor<1x224x112x512xi8>, %arg1: tensor, %arg2: tensor) -> tensor<1x224x112x512xf32> { + %0 = "tf.Cast"(%arg0) : (tensor<1x224x112x512xi8>) -> tensor<1x224x112x512xi32> + %1 = "tf.Sub"(%0, %arg2) : (tensor<1x224x112x512xi32>, tensor) -> tensor<1x224x112x512xi32> + %2 = "tf.Cast"(%1) : (tensor<1x224x112x512xi32>) -> tensor<1x224x112x512xf32> + %3 = "tf.Mul"(%2, %arg1) : (tensor<1x224x112x512xf32>, tensor) -> tensor<1x224x112x512xf32> + return %3 : tensor<1x224x112x512xf32> + } + func.func private @quantized_conv2d_with_relu_fn_0(%arg0: tensor<1x224x224x3xi8>, %arg1: tensor<32x32x3x512xi8>, %arg2: tensor, %arg3: tensor, %arg4: tensor<512xf32>, %arg5: tensor<512xi32>, %arg6: tensor, %arg7: tensor) -> tensor<1x224x112x512xi8> { + %cst = "tf.Const"() {value = dense<127> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<-128> : tensor} : () -> tensor + %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x224x224x3xi8>) -> tensor<1x224x224x3xi32> + %1 = "tf.Sub"(%0, %arg3) : (tensor<1x224x224x3xi32>, tensor) -> tensor<1x224x224x3xi32> + %2 = "tf.Identity"(%arg1) : (tensor<32x32x3x512xi8>) -> tensor<32x32x3x512xi8> + %3 = "tf.Cast"(%2) {Truncate = false} : (tensor<32x32x3x512xi8>) -> tensor<32x32x3x512xi32> + %4 = "tf.Sub"(%3, %arg5) : (tensor<32x32x3x512xi32>, tensor<512xi32>) -> tensor<32x32x3x512xi32> + %5 = "tf.Conv2D"(%1, %4) {dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x224x224x3xi32>, tensor<32x32x3x512xi32>) -> tensor<1x224x112x512xi32> + %6 = "tf.Mul"(%arg2, %arg4) : (tensor, tensor<512xf32>) -> tensor<512xf32> + %7 = "tf.Div"(%6, %arg6) : (tensor<512xf32>, tensor) -> tensor<512xf32> + %8 = "tf.Cast"(%5) {Truncate = false} : (tensor<1x224x112x512xi32>) -> tensor<1x224x112x512xf32> + %9 = "tf.Mul"(%7, %8) : (tensor<512xf32>, tensor<1x224x112x512xf32>) -> tensor<1x224x112x512xf32> + %10 = "tf.Round"(%9) : (tensor<1x224x112x512xf32>) -> tensor<1x224x112x512xf32> + %11 = "tf.Cast"(%10) {Truncate = false} : (tensor<1x224x112x512xf32>) -> tensor<1x224x112x512xi32> + %12 = "tf.AddV2"(%11, %arg7) : (tensor<1x224x112x512xi32>, tensor) -> tensor<1x224x112x512xi32> + %13 = "tf.Maximum"(%cst_0, %arg7) : (tensor, tensor) -> tensor + %14 = "tf.ClipByValue"(%12, %13, %cst) : (tensor<1x224x112x512xi32>, tensor, tensor) -> tensor<1x224x112x512xi32> + %15 = "tf.Cast"(%14) {Truncate = false} : (tensor<1x224x112x512xi32>) -> tensor<1x224x112x512xi8> + return %15 : tensor<1x224x112x512xi8> + } + +// CHECK-LABEL: func @conv_with_filter_larger_than_1MB +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<-264192> : tensor<1x1x1x512xi32>}> : () -> tensor<1x1x1x512xi32> +// CHECK: %[[PADV2_0:.*]] = "tf.PadV2" +// CHECK: 
%[[XLACONVV2_0:.*]] = "tf.XlaConvV2"(%[[PADV2_0]] +// CHECK: %[[SUB_0:.*]] = "tf.Sub"(%[[XLACONVV2_0]], %[[CONST]]) +} + +// ----- + +module attributes {tf_saved_model.semantics} { + func.func @matmul_with_relu(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["serving_default_input_tensor:0"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["tf.PartitionedCall:0"]}) attributes {tf.entry_function = {inputs = "serving_default_input_tensor:0", outputs = "tf.PartitionedCall:0"}, tf_saved_model.exported_names = ["main"]} { + %cst = "tf.Const"() {device = "", value = dense<3.08643539E-5> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "", value = dense<-1.275000e+02> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {device = "", value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {device = "", value = dense<1> : tensor<1024x3xi8>} : () -> tensor<1024x3xi8> + %cst_3 = "tf.Const"() {device = "", value = dense<0.00392156653> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {device = "", value = dense<-128> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {device = "", value = dense<1.270000e+02> : tensor} : () -> tensor + %0 = "tf.Div"(%arg0, %cst_3) {device = ""} : (tensor<1x1024xf32>, tensor) -> tensor<1x1024xf32> + %1 = "tf.AddV2"(%0, %cst_0) {device = ""} : (tensor<1x1024xf32>, tensor) -> tensor<1x1024xf32> + %2 = "tf.Floor"(%1) {device = ""} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + %3 = "tf.ClipByValue"(%2, %cst_1, %cst_5) {device = ""} : (tensor<1x1024xf32>, tensor, tensor) -> tensor<1x1024xf32> + %4 = "tf.Cast"(%3) {Truncate = false, device = ""} : (tensor<1x1024xf32>) -> tensor<1x1024xi8> + %5 = "tf.Cast"(%4) {Truncate = false, device = ""} : (tensor<1x1024xi8>) -> tensor<1x1024xi32> + %6 = "tf.Sub"(%5, %cst_4) {device = ""} : (tensor<1x1024xi32>, tensor) -> tensor<1x1024xi32> + %7 = "tf.Identity"(%cst_2) {device = ""} : (tensor<1024x3xi8>) -> tensor<1024x3xi8> + %8 = "tf.Cast"(%7) {Truncate = false, device = ""} : (tensor<1024x3xi8>) -> tensor<1024x3xi32> + %9 = "tf.MatMul"(%6, %8) {device = "", transpose_a = false, transpose_b = false} : (tensor<1x1024xi32>, tensor<1024x3xi32>) -> tensor<1x3xi32> + %10 = "tf.Cast"(%9) {Truncate = false, device = ""} : (tensor<1x3xi32>) -> tensor<1x3xf32> + %11 = "tf.Mul"(%10, %cst) {device = ""} : (tensor<1x3xf32>, tensor) -> tensor<1x3xf32> + %12 = "tf.Relu"(%11) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %12 : tensor<1x3xf32> + } +// CHECK-LABEL: func @matmul_with_relu +// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() <{value = dense<1> : tensor<1024x3xi8>}> {device = ""} : () -> tensor<1024x3xi8> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<-131072> : tensor<1x3xi32>}> : () -> tensor<1x3xi32> +// CHECK: %[[MATMUL:.*]] = "tf.XlaDotV2"({{.*}}, %[[WEIGHT]]) +// CHECK-SAME: (tensor<1x1024xi8>, tensor<1024x3xi8>) -> tensor<1x3xi32> +// CHECK: %[[SUB:.*]] = "tf.Sub"(%[[MATMUL]], %[[CONST]]) : (tensor<1x3xi32>, tensor<1x3xi32>) -> tensor<1x3xi32> +} + +// ----- + +module attributes {} { + func.func @matmul_two_tensors_with_static_shape(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> (tensor<2x2xf32>) { + %cst = "tf.Const"() {value = dense<-5.450000e+01> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<0.0156862754> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<-5.000000e-01> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {value = dense<0.0274509806> : tensor} : () -> tensor 
+ %cst_4 = "tf.Const"() {value = dense<-55> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_6 = "tf.Const"() {value = dense<1.270000e+02> : tensor} : () -> tensor + %0 = "tf.Div"(%arg1, %cst_0) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + %1 = "tf.AddV2"(%0, %cst_1) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + %2 = "tf.Floor"(%1) : (tensor<2x2xf32>) -> tensor<2x2xf32> + %3 = "tf.ClipByValue"(%2, %cst_5, %cst_6) : (tensor<2x2xf32>, tensor, tensor) -> tensor<2x2xf32> + %4 = "tf.Cast"(%3) {Truncate = false} : (tensor<2x2xf32>) -> tensor<2x2xi8> + %5 = "tf.Div"(%arg0, %cst_3) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + %6 = "tf.AddV2"(%5, %cst) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + %7 = "tf.Floor"(%6) : (tensor<2x2xf32>) -> tensor<2x2xf32> + %8 = "tf.ClipByValue"(%7, %cst_5, %cst_6) : (tensor<2x2xf32>, tensor, tensor) -> tensor<2x2xf32> + %9 = "tf.Cast"(%8) {Truncate = false} : (tensor<2x2xf32>) -> tensor<2x2xi8> + %10 = "tf.Cast"(%9) {Truncate = false} : (tensor<2x2xi8>) -> tensor<2x2xi32> + %11 = "tf.Sub"(%10, %cst_4) : (tensor<2x2xi32>, tensor) -> tensor<2x2xi32> + %12 = "tf.Identity"(%4) : (tensor<2x2xi8>) -> tensor<2x2xi8> + %13 = "tf.Cast"(%12) {Truncate = false} : (tensor<2x2xi8>) -> tensor<2x2xi32> + %14 = "tf.Sub"(%13, %cst_2) : (tensor<2x2xi32>, tensor) -> tensor<2x2xi32> + %15 = "tf.MatMul"(%11, %14) {transpose_a = false, transpose_b = false} : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32> + %16 = "tf.Cast"(%15) {Truncate = false} : (tensor<2x2xi32>) -> tensor<2x2xf32> + %17 = "tf.Mul"(%16, %cst_0) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + %18 = "tf.AddV2"(%17, %cst) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + %19 = "tf.Floor"(%18) : (tensor<2x2xf32>) -> tensor<2x2xf32> + %20 = "tf.ClipByValue"(%19, %cst_5, %cst_6) : (tensor<2x2xf32>, tensor, tensor) -> tensor<2x2xf32> + %21 = "tf.Cast"(%20) {Truncate = false} : (tensor<2x2xf32>) -> tensor<2x2xi8> + %22 = "tf.Identity"(%21) {device = ""} : (tensor<2x2xi8>) -> tensor<2x2xi8> + %23 = "tf.Identity"(%22) {device = ""} : (tensor<2x2xi8>) -> tensor<2x2xi8> + %24 = "tf.Cast"(%23) : (tensor<2x2xi8>) -> tensor<2x2xi32> + %25 = "tf.Sub"(%24, %cst_4) : (tensor<2x2xi32>, tensor) -> tensor<2x2xi32> + %26 = "tf.Cast"(%25) : (tensor<2x2xi32>) -> tensor<2x2xf32> + %27 = "tf.Mul"(%26, %cst_3) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + return %27 : tensor<2x2xf32> + } + +// CHECK-LABEL: func @matmul_two_tensors_with_static_shape +// CHECK: %[[arg1_div:.*]] = "tf.Div"(%arg1 +// CHECK: %[[arg1_add:.*]] = "tf.AddV2"(%[[arg1_div]] +// CHECK: %[[arg1_floor:.*]] = "tf.Floor"(%[[arg1_add]] +// CHECK: %[[arg1_clip:.*]] = "tf.ClipByValue"(%[[arg1_floor]] +// CHECK: %[[arg1_cast:.*]] = "tf.Cast"(%[[arg1_clip]] + +// CHECK: %[[arg0_div:.*]] = "tf.Div"(%arg0 +// CHECK: %[[arg0_add:.*]] = "tf.AddV2"(%[[arg0_div]] +// CHECK: %[[arg0_floor:.*]] = "tf.Floor"(%[[arg0_add]] +// CHECK: %[[arg0_clip:.*]] = "tf.ClipByValue"(%[[arg0_floor]] +// CHECK: %[[arg0_cast:.*]] = "tf.Cast"(%[[arg0_clip]] + +// CHECK: %[[arg1_identity:.*]] = "tf.Identity"(%[[arg1_cast]] + +// CHECK: %[[matmul:.*]] = "tf.XlaDotV2"(%[[arg0_cast]], %[[arg1_identity]] +// CHECK-SAME: (tensor<2x2xi8>, tensor<2x2xi8>) -> tensor<2x2xi32> + +// CHECK: %[[matmul_sub:.*]] = "tf.Sub"(%[[matmul]] +// CHECK: %[[matmul_cast:.*]] = "tf.Cast"(%[[matmul_sub]] +// CHECK: %[[matmul_mul:.*]] = "tf.Mul"(%[[matmul_cast]] +// CHECK: %[[matmul_add:.*]] = "tf.AddV2"(%[[matmul_mul]] +// CHECK: 
%[[matmul_floor:.*]] = "tf.Floor"(%[[matmul_add]] +// CHECK: %[[matmul_clip:.*]] = "tf.ClipByValue"(%[[matmul_floor]] +} + +// ----- + +module attributes {} { + func.func @matmul_two_tensors_with_dynamic_shape(%arg0: tensor, %arg1: tensor) -> (tensor) { + %cst = "tf.Const"() {value = dense<-5.450000e+01> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<0.0156862754> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<-5.000000e-01> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {value = dense<0.0274509806> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {value = dense<-55> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_6 = "tf.Const"() {value = dense<1.270000e+02> : tensor} : () -> tensor + %0 = "tf.Div"(%arg1, %cst_0) : (tensor, tensor) -> tensor + %1 = "tf.AddV2"(%0, %cst_1) : (tensor, tensor) -> tensor + %2 = "tf.Floor"(%1) : (tensor) -> tensor + %3 = "tf.ClipByValue"(%2, %cst_5, %cst_6) : (tensor, tensor, tensor) -> tensor + %4 = "tf.Cast"(%3) {Truncate = false} : (tensor) -> tensor + %5 = "tf.Div"(%arg0, %cst_3) : (tensor, tensor) -> tensor + %6 = "tf.AddV2"(%5, %cst) : (tensor, tensor) -> tensor + %7 = "tf.Floor"(%6) : (tensor) -> tensor + %8 = "tf.ClipByValue"(%7, %cst_5, %cst_6) : (tensor, tensor, tensor) -> tensor + %9 = "tf.Cast"(%8) {Truncate = false} : (tensor) -> tensor + %10 = "tf.Cast"(%4) {Truncate = false} : (tensor) -> tensor + %11 = "tf.Sub"(%10, %cst_2) : (tensor, tensor) -> tensor + %12 = "tf.Identity"(%9) : (tensor) -> tensor + %13 = "tf.Cast"(%12) {Truncate = false} : (tensor) -> tensor + %14 = "tf.Sub"(%13, %cst_4) : (tensor, tensor) -> tensor + %15 = "tf.MatMul"(%11, %14) {transpose_a = false, transpose_b = false} : (tensor, tensor) -> tensor + %16 = "tf.Cast"(%15) {Truncate = false} : (tensor) -> tensor + %17 = "tf.Mul"(%16, %cst_0) : (tensor, tensor) -> tensor + %18 = "tf.AddV2"(%17, %cst) : (tensor, tensor) -> tensor + %19 = "tf.Floor"(%18) : (tensor) -> tensor + %20 = "tf.ClipByValue"(%19, %cst_5, %cst_6) : (tensor, tensor, tensor) -> tensor + %21 = "tf.Cast"(%20) {Truncate = false} : (tensor) -> tensor + %22 = "tf.Identity"(%21) {device = ""} : (tensor) -> tensor + %23 = "tf.Identity"(%22) {device = ""} : (tensor) -> tensor + %24 = "tf.Cast"(%23) : (tensor) -> tensor + %25 = "tf.Sub"(%24, %cst_4) : (tensor, tensor) -> tensor + %26 = "tf.Cast"(%25) : (tensor) -> tensor + %27 = "tf.Mul"(%26, %cst_3) : (tensor, tensor) -> tensor + return %27 : tensor + } + +// CHECK-LABEL: func @matmul_two_tensors_with_dynamic_shape +// CHECK: %[[arg1_div:.*]] = "tf.Div"(%arg1 +// CHECK: %[[arg1_add:.*]] = "tf.AddV2"(%[[arg1_div]] +// CHECK: %[[arg1_floor:.*]] = "tf.Floor"(%[[arg1_add]] +// CHECK: %[[arg1_clip:.*]] = "tf.ClipByValue"(%[[arg1_floor]] +// CHECK: %[[arg1_cast:.*]] = "tf.Cast"(%[[arg1_clip]] + +// CHECK: %[[arg0_div:.*]] = "tf.Div"(%arg0 +// CHECK: %[[arg0_add:.*]] = "tf.AddV2"(%[[arg0_div]] +// CHECK: %[[arg0_floor:.*]] = "tf.Floor"(%[[arg0_add]] +// CHECK: %[[arg0_clip:.*]] = "tf.ClipByValue"(%[[arg0_floor]] +// CHECK: %[[arg0_cast:.*]] = "tf.Cast"(%[[arg0_clip]] +// CHECK: %[[arg0_identity:.*]] = "tf.Identity"(%[[arg0_cast]] + +// CHECK: %[[matmul:.*]] = "tf.XlaDotV2"(%[[arg1_cast]], %[[arg0_identity]] +// CHECK-SAME: (tensor, tensor) -> tensor + +// CHECK: %[[arg0_shape:.*]] = "tf.Shape"(%[[arg0_identity]] +// CHECK: %[[shape_zp_contribute:.*]] = "tf.StridedSlice"(%[[arg0_shape]] +// CHECK: 
%[[shape_zp_contribute_cast:.*]] = "tf.Cast"(%[[shape_zp_contribute]] +// CHECK: %[[shape_zp_contribute_mul:.*]] = "tf.Mul"(%[[shape_zp_contribute_cast]] +// CHECK: %[[zp:.*]] = "tf.Sub"({{.*}}, %[[shape_zp_contribute_mul]]) + +// CHECK: %[[matmul_sub:.*]] = "tf.Sub"(%[[matmul]], %[[zp]] +// CHECK: %[[matmul_cast:.*]] = "tf.Cast"(%[[matmul_sub]] +// CHECK: %[[matmul_mul:.*]] = "tf.Mul"(%[[matmul_cast]] +// CHECK: %[[matmul_add:.*]] = "tf.AddV2"(%[[matmul_mul]] +// CHECK: %[[matmul_floor:.*]] = "tf.Floor"(%[[matmul_add]] +// CHECK: %[[matmul_clip:.*]] = "tf.ClipByValue"(%[[matmul_floor]] + +} + +// ----- + +module attributes {tf_saved_model.semantics} { + func.func @conv3d_with_static_shape(%arg0: tensor<1x3x4x3x3xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x3x2x3x2xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "tf.PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() {device = "", value = dense<[4.57413898E-6, 4.56899261E-6]> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_0 = "tf.Const"() {device = "", value = dense<-4.250000e+01> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {device = "", value = dense<1> : tensor<2x3x3x3x2xi8>} : () -> tensor<2x3x3x3x2xi8> + %cst_2 = "tf.Const"() {device = "", value = dense<0.00117643911> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {device = "", value = dense<-43> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {device = "", value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {device = "", value = dense<1.270000e+02> : tensor} : () -> tensor + %0 = "tf.Div"(%arg0, %cst_2) {device = ""} : (tensor<1x3x4x3x3xf32>, tensor) -> tensor<1x3x4x3x3xf32> + %1 = "tf.AddV2"(%0, %cst_0) {device = ""} : (tensor<1x3x4x3x3xf32>, tensor) -> tensor<1x3x4x3x3xf32> + %2 = "tf.Floor"(%1) {device = ""} : (tensor<1x3x4x3x3xf32>) -> tensor<1x3x4x3x3xf32> + %3 = "tf.ClipByValue"(%2, %cst_4, %cst_5) {device = ""} : (tensor<1x3x4x3x3xf32>, tensor, tensor) -> tensor<1x3x4x3x3xf32> + %4 = "tf.Cast"(%3) {Truncate = false, device = ""} : (tensor<1x3x4x3x3xf32>) -> tensor<1x3x4x3x3xi8> + %5 = "tf.Cast"(%4) {Truncate = false, device = ""} : (tensor<1x3x4x3x3xi8>) -> tensor<1x3x4x3x3xi32> + %6 = "tf.Sub"(%5, %cst_3) {device = ""} : (tensor<1x3x4x3x3xi32>, tensor) -> tensor<1x3x4x3x3xi32> + %7 = "tf.Identity"(%cst_1) {device = ""} : (tensor<2x3x3x3x2xi8>) -> tensor<2x3x3x3x2xi8> + %8 = "tf.Cast"(%7) {Truncate = false, device = ""} : (tensor<2x3x3x3x2xi8>) -> tensor<2x3x3x3x2xi32> + %9 = "tf.Cast"(%6) {Truncate = false, device = ""} : (tensor<1x3x4x3x3xi32>) -> tensor<1x3x4x3x3xf32> + %10 = "tf.Cast"(%8) {Truncate = false, device = ""} : (tensor<2x3x3x3x2xi32>) -> tensor<2x3x3x3x2xf32> + %11 = "tf.Conv3D"(%9, %10) {device = "", dilations = [1, 1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1, 1]} : (tensor<1x3x4x3x3xf32>, tensor<2x3x3x3x2xf32>) -> tensor<1x3x2x3x2xf32> + %12 = "tf.Cast"(%11) {device = ""} : (tensor<1x3x2x3x2xf32>) -> tensor<1x3x2x3x2xi32> + %13 = "tf.Cast"(%12) {Truncate = false, device = ""} : (tensor<1x3x2x3x2xi32>) -> tensor<1x3x2x3x2xf32> + %14 = "tf.Mul"(%13, %cst) {device = ""} : (tensor<1x3x2x3x2xf32>, tensor<2xf32>) -> tensor<1x3x2x3x2xf32> + %15 = "tf.Identity"(%14) {device = ""} : (tensor<1x3x2x3x2xf32>) -> tensor<1x3x2x3x2xf32> + return %15 : tensor<1x3x2x3x2xf32> + } + +// CHECK-LABEL: func @conv3d_with_static_shape +// CHECK-DAG: %[[WEIGHT:.*]] = 
"tf.Const"() <{value = dense<1> : tensor<2x3x3x3x2xi8>}> {device = ""} : () -> tensor<2x3x3x3x2xi8> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() {{.*}} : () -> tensor<5x2xi32> +// CHECK-DAG-SAME{LITERAL}: value = dense<[[0, 0], [0, 1], [0, 1], [1, 1], [0, 0]]> : tensor<5x2xi32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<-43> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<-2322> : tensor<1x1x1x1x2xi32>}> : () -> tensor<1x1x1x1x2xi32> + +// CHECK: %[[PAD:.*]] = "tf.PadV2"({{.*}}, %[[CONST]], %[[CONST_1]]) +// CHECK: %[[CONV:.*]] = "tf.XlaConvV2"(%[[PAD]], %[[WEIGHT]] +// CHECK-SAME: (tensor<1x4x5x5x3xi8>, tensor<2x3x3x3x2xi8>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<1x3x2x3x2xi32> +// CHECK: %[[SUB:.*]] = "tf.Sub"(%[[CONV]], %[[CONST_2]]) +} + +// ----- + +module attributes {tf_saved_model.semantics} { + func.func @conv3d_with_dynamic_shape(%arg0: tensor {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "tf.PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() {device = "", value = dense<[4.57413898E-6, 4.56899261E-6]> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_0 = "tf.Const"() {device = "", value = dense<-4.250000e+01> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {device = "", value = dense<[4987, 41620]> : tensor<2xi32>} : () -> tensor<2xi32> + %cst_2 = "tf.Const"() {device = "", value = dense<1> : tensor<2x3x3x3x2xi8>} : () -> tensor<2x3x3x3x2xi8> + %cst_3 = "tf.Const"() {device = "", value = dense<0.00117643911> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {device = "", value = dense<-43> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {device = "", value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_6 = "tf.Const"() {device = "", value = dense<1.270000e+02> : tensor} : () -> tensor + %0 = "tf.Div"(%arg0, %cst_3) {device = ""} : (tensor, tensor) -> tensor + %1 = "tf.AddV2"(%0, %cst_0) {device = ""} : (tensor, tensor) -> tensor + %2 = "tf.Floor"(%1) {device = ""} : (tensor) -> tensor + %3 = "tf.ClipByValue"(%2, %cst_5, %cst_6) {device = ""} : (tensor, tensor, tensor) -> tensor + %4 = "tf.Cast"(%3) {Truncate = false, device = ""} : (tensor) -> tensor + %5 = "tf.Cast"(%4) {Truncate = false, device = ""} : (tensor) -> tensor + %6 = "tf.Sub"(%5, %cst_4) {device = ""} : (tensor, tensor) -> tensor + %7 = "tf.Identity"(%cst_2) {device = ""} : (tensor<2x3x3x3x2xi8>) -> tensor<2x3x3x3x2xi8> + %8 = "tf.Cast"(%7) {Truncate = false, device = ""} : (tensor<2x3x3x3x2xi8>) -> tensor<2x3x3x3x2xi32> + %9 = "tf.Cast"(%6) {Truncate = false, device = ""} : (tensor) -> tensor + %10 = "tf.Cast"(%8) {Truncate = false, device = ""} : (tensor<2x3x3x3x2xi32>) -> tensor<2x3x3x3x2xf32> + %11 = "tf.Conv3D"(%9, %10) {device = "", dilations = [1, 1, 1, 1, 1], padding = "SAME", strides = [1, 1, 2, 1, 1]} : (tensor, tensor<2x3x3x3x2xf32>) -> tensor + %12 = "tf.Cast"(%11) {device = ""} : (tensor) -> tensor + %13 = "tf.AddV2"(%12, %cst_1) {device = ""} : (tensor, tensor<2xi32>) -> tensor + %14 = "tf.Cast"(%13) {Truncate = false, device = ""} : (tensor) -> tensor + %15 = "tf.Mul"(%14, %cst) {device = ""} : (tensor, tensor<2xf32>) -> tensor + %16 = "tf.Identity"(%15) {device = ""} : (tensor) -> tensor + return %16 : tensor + } + +// CHECK-LABEL: func @conv3d_with_dynamic_shape +// CHECK-DAG: %[[WEIGHT:.*]] = 
"tf.Const"() <{value = dense<1> : tensor<2x3x3x3x2xi8>}> {device = ""} : () -> tensor<2x3x3x3x2xi8> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<-43> : tensor}> : () -> tensor +// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<-2322> : tensor<1x1x1x1x2xi32>}> : () -> tensor<1x1x1x1x2xi32> + +// CHECK: %[[CONCAT:.*]] = "tf.Concat"({{.*}}) +// CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%[[CONCAT]], {{.*}}) : (tensor<10xi32>, tensor<2xi64>) -> tensor<5x2xi32> +// CHECK: %[[PAD:.*]] = "tf.PadV2"({{.*}}, %[[RESHAPE]], %[[CONST_1]]) +// CHECK: %[[CONV:.*]] = "tf.XlaConvV2"(%[[PAD]], %[[WEIGHT]] +// CHECK-SAME: (tensor, tensor<2x3x3x3x2xi8>, tensor<3xi32>, tensor<3x2xi32>, tensor<3xi32>, tensor<3xi32>, tensor) -> tensor +// CHECK: %[[SUB:.*]] = "tf.Sub"(%[[CONV]], %[[CONST_2]]) +} + +// ----- + +module attributes {tf_saved_model.semantics} { + func.func @batch_matmul(%arg0: tensor<20x30x64x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<20x30x64x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "tf.PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() {device = "", value = dense<3.08784583E-5> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "", value = dense<-1.275000e+02> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {device = "", value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {device = "", value = dense<1> : tensor<20x30x1024x3xi8>} : () -> tensor<20x30x1024x3xi8> + %cst_3 = "tf.Const"() {device = "", value = dense<0.00392156886> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {device = "", value = dense<-128> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {device = "", value = dense<1.270000e+02> : tensor} : () -> tensor + %0 = "tf.Div"(%arg0, %cst_3) {device = ""} : (tensor<20x30x64x1024xf32>, tensor) -> tensor<20x30x64x1024xf32> + %1 = "tf.AddV2"(%0, %cst_0) {device = ""} : (tensor<20x30x64x1024xf32>, tensor) -> tensor<20x30x64x1024xf32> + %2 = "tf.Floor"(%1) {device = ""} : (tensor<20x30x64x1024xf32>) -> tensor<20x30x64x1024xf32> + %3 = "tf.ClipByValue"(%2, %cst_1, %cst_5) {device = ""} : (tensor<20x30x64x1024xf32>, tensor, tensor) -> tensor<20x30x64x1024xf32> + %4 = "tf.Cast"(%3) {Truncate = false, device = ""} : (tensor<20x30x64x1024xf32>) -> tensor<20x30x64x1024xi8> + %5 = "tf.Cast"(%4) {Truncate = false, device = ""} : (tensor<20x30x64x1024xi8>) -> tensor<20x30x64x1024xi32> + %6 = "tf.Sub"(%5, %cst_4) {device = ""} : (tensor<20x30x64x1024xi32>, tensor) -> tensor<20x30x64x1024xi32> + %7 = "tf.Identity"(%cst_2) {device = ""} : (tensor<20x30x1024x3xi8>) -> tensor<20x30x1024x3xi8> + %8 = "tf.Cast"(%7) {Truncate = false, device = ""} : (tensor<20x30x1024x3xi8>) -> tensor<20x30x1024x3xi32> + %9 = "tf.BatchMatMulV2"(%6, %8) {adj_x = false, adj_y = false, device = ""} : (tensor<20x30x64x1024xi32>, tensor<20x30x1024x3xi32>) -> tensor<20x30x64x3xi32> + %10 = "tf.Cast"(%9) {Truncate = false, device = ""} : (tensor<20x30x64x3xi32>) -> tensor<20x30x64x3xf32> + %11 = "tf.Mul"(%10, %cst) {device = ""} : (tensor<20x30x64x3xf32>, tensor) -> tensor<20x30x64x3xf32> + %12 = "tf.Relu"(%11) {device = ""} : (tensor<20x30x64x3xf32>) -> tensor<20x30x64x3xf32> + %13 = "tf.Identity"(%12) {device = ""} : (tensor<20x30x64x3xf32>) -> tensor<20x30x64x3xf32> + return %13 : tensor<20x30x64x3xf32> + } + +// CHECK-LABEL: func @batch_matmul +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() 
<{value = dense<-131072> : tensor<20x30x1x3xi32>}> : () -> tensor<20x30x1x3xi32> +// CHECK: %[[CAST:.*]] = "tf.Cast" +// CHECK: %[[XLADOTV2_0:.*]] = "tf.XlaDotV2"(%[[CAST]] +// CHECK: %[[SUB_0:.*]] = "tf.Sub"(%[[XLADOTV2_0]], %[[CONST]]) : (tensor<20x30x64x3xi32>, tensor<20x30x1x3xi32>) -> tensor<20x30x64x3xi32> +} + +// ----- + +module attributes {tf_saved_model.semantics} { + func.func @broadcasting_weight_batch_matmul(%arg0: tensor<2x1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<2x1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() {device = "", value = dense<3.08762283E-5> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "", value = dense<-1.275000e+02> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {device = "", value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {device = "", value = dense<[-241, 5894, -3771]> : tensor<3xi32>} : () -> tensor<3xi32> + %cst_3 = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<1024x3xi8>} : () -> tensor<1024x3xi8> + %cst_4 = "tf.Const"() {device = "", value = dense<0.00392156513> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {device = "", value = dense<-128> : tensor} : () -> tensor + %cst_6 = "tf.Const"() {device = "", value = dense<1.270000e+02> : tensor} : () -> tensor + %0 = "tf.Div"(%arg0, %cst_4) {device = ""} : (tensor<2x1x1024xf32>, tensor) -> tensor<2x1x1024xf32> + %1 = "tf.AddV2"(%0, %cst_0) {device = ""} : (tensor<2x1x1024xf32>, tensor) -> tensor<2x1x1024xf32> + %2 = "tf.Floor"(%1) {device = ""} : (tensor<2x1x1024xf32>) -> tensor<2x1x1024xf32> + %3 = "tf.ClipByValue"(%2, %cst_1, %cst_6) {device = ""} : (tensor<2x1x1024xf32>, tensor, tensor) -> tensor<2x1x1024xf32> + %4 = "tf.Cast"(%3) {Truncate = false, device = ""} : (tensor<2x1x1024xf32>) -> tensor<2x1x1024xi8> + %5 = "tf.Cast"(%4) {Truncate = false, device = ""} : (tensor<2x1x1024xi8>) -> tensor<2x1x1024xi32> + %6 = "tf.Sub"(%5, %cst_5) {device = ""} : (tensor<2x1x1024xi32>, tensor) -> tensor<2x1x1024xi32> + %7 = "tf.Identity"(%cst_3) {device = ""} : (tensor<1024x3xi8>) -> tensor<1024x3xi8> + %8 = "tf.Cast"(%7) {Truncate = false, device = ""} : (tensor<1024x3xi8>) -> tensor<1024x3xi32> + %9 = "tf.BatchMatMulV2"(%6, %8) {adj_x = false, adj_y = false, device = ""} : (tensor<2x1x1024xi32>, tensor<1024x3xi32>) -> tensor<2x1x3xi32> + %10 = "tf.AddV2"(%9, %cst_2) {device = ""} : (tensor<2x1x3xi32>, tensor<3xi32>) -> tensor<2x1x3xi32> + %11 = "tf.Cast"(%10) {Truncate = false, device = ""} : (tensor<2x1x3xi32>) -> tensor<2x1x3xf32> + %12 = "tf.Mul"(%11, %cst) {device = ""} : (tensor<2x1x3xf32>, tensor) -> tensor<2x1x3xf32> + %13 = "tf.Identity"(%12) {device = ""} : (tensor<2x1x3xf32>) -> tensor<2x1x3xf32> + %14 = "tf.Identity"(%13) {device = ""} : (tensor<2x1x3xf32>) -> tensor<2x1x3xf32> + return %14 : tensor<2x1x3xf32> + } + +// CHECK-LABEL: func @broadcasting_weight_batch_matmul +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<[2, 1024, 3]> : tensor<3xi64>}> : () -> tensor<3xi64> +// CHECK: %[[CAST:.*]] = "tf.Cast" +// CHECK: %[[BROADCAST_TO:.*]] = "tf.BroadcastTo"({{.*}}, %[[CONST]]) : (tensor<1024x3xi8>, tensor<3xi64>) -> tensor<2x1024x3xi8> +// CHECK: %[[XLADOTV2_0:.*]] = "tf.XlaDotV2"(%[[CAST]], %[[BROADCAST_TO]]) +} + +// ----- + +module attributes {tf_saved_model.semantics} { + func.func 
@broadcasting_input_batch_matmul(%arg0: tensor<2x1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<2x2x1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() {device = "", value = dense<3.08762283E-5> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "", value = dense<-1.275000e+02> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {device = "", value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {device = "", value = dense<[-241, 5894, -3771]> : tensor<3xi32>} : () -> tensor<3xi32> + %cst_3 = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<2x2x1024x3xi8>} : () -> tensor<2x2x1024x3xi8> + %cst_4 = "tf.Const"() {device = "", value = dense<0.00392156513> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {device = "", value = dense<-128> : tensor} : () -> tensor + %cst_6 = "tf.Const"() {device = "", value = dense<1.270000e+02> : tensor} : () -> tensor + %0 = "tf.Div"(%arg0, %cst_4) {device = ""} : (tensor<2x1x1024xf32>, tensor) -> tensor<2x1x1024xf32> + %1 = "tf.AddV2"(%0, %cst_0) {device = ""} : (tensor<2x1x1024xf32>, tensor) -> tensor<2x1x1024xf32> + %2 = "tf.Floor"(%1) {device = ""} : (tensor<2x1x1024xf32>) -> tensor<2x1x1024xf32> + %3 = "tf.ClipByValue"(%2, %cst_1, %cst_6) {device = ""} : (tensor<2x1x1024xf32>, tensor, tensor) -> tensor<2x1x1024xf32> + %4 = "tf.Cast"(%3) {Truncate = false, device = ""} : (tensor<2x1x1024xf32>) -> tensor<2x1x1024xi8> + %5 = "tf.Cast"(%4) {Truncate = false, device = ""} : (tensor<2x1x1024xi8>) -> tensor<2x1x1024xi32> + %6 = "tf.Sub"(%5, %cst_5) {device = ""} : (tensor<2x1x1024xi32>, tensor) -> tensor<2x1x1024xi32> + %7 = "tf.Identity"(%cst_3) {device = ""} : (tensor<2x2x1024x3xi8>) -> tensor<2x2x1024x3xi8> + %8 = "tf.Cast"(%7) {Truncate = false, device = ""} : (tensor<2x2x1024x3xi8>) -> tensor<2x2x1024x3xi32> + %9 = "tf.BatchMatMulV2"(%6, %8) {adj_x = false, adj_y = false, device = ""} : (tensor<2x1x1024xi32>, tensor<2x2x1024x3xi32>) -> tensor<2x2x1x3xi32> + %10 = "tf.AddV2"(%9, %cst_2) {device = ""} : (tensor<2x2x1x3xi32>, tensor<3xi32>) -> tensor<2x2x1x3xi32> + %11 = "tf.Cast"(%10) {Truncate = false, device = ""} : (tensor<2x2x1x3xi32>) -> tensor<2x2x1x3xf32> + %12 = "tf.Mul"(%11, %cst) {device = ""} : (tensor<2x2x1x3xf32>, tensor) -> tensor<2x2x1x3xf32> + %13 = "tf.Identity"(%12) {device = ""} : (tensor<2x2x1x3xf32>) -> tensor<2x2x1x3xf32> + %14 = "tf.Identity"(%13) {device = ""} : (tensor<2x2x1x3xf32>) -> tensor<2x2x1x3xf32> + return %14 : tensor<2x2x1x3xf32> + } + +// CHECK-LABEL: func @broadcasting_input_batch_matmul +// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() <{value = {{.*}} : tensor<2x2x1024x3xi8>}> {device = ""} : () -> tensor<2x2x1024x3xi8> +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<[2, 2, 1, 1024]> : tensor<4xi64>}> : () -> tensor<4xi64> +// CHECK: %[[CAST:.*]] = "tf.Cast" +// CHECK: %[[BROADCAST_TO:.*]] = "tf.BroadcastTo"(%[[CAST]], %[[CONST]]) : (tensor<2x1x1024xi8>, tensor<4xi64>) -> tensor<2x2x1x1024xi8> +// CHECK: %[[XLADOTV2_0:.*]] = "tf.XlaDotV2"(%[[BROADCAST_TO]], %[[WEIGHT]]) +} + +// ----- + +module attributes {tf_saved_model.semantics} { + func.func @dynamic_shape_batch_matmul(%arg0: tensor {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", 
inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() {device = "", value = dense<3.08762283E-5> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "", value = dense<-1.275000e+02> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {device = "", value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {device = "", value = dense<[-241, 5894, -3771]> : tensor<3xi32>} : () -> tensor<3xi32> + %cst_3 = "tf.Const"() {device = "", value = dense_resource<__elided__> : tensor<1024x3xi8>} : () -> tensor<1024x3xi8> + %cst_4 = "tf.Const"() {device = "", value = dense<0.00392156513> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {device = "", value = dense<-128> : tensor} : () -> tensor + %cst_6 = "tf.Const"() {device = "", value = dense<1.270000e+02> : tensor} : () -> tensor + %0 = "tf.Div"(%arg0, %cst_4) {device = ""} : (tensor, tensor) -> tensor + %1 = "tf.AddV2"(%0, %cst_0) {device = ""} : (tensor, tensor) -> tensor + %2 = "tf.Floor"(%1) {device = ""} : (tensor) -> tensor + %3 = "tf.ClipByValue"(%2, %cst_1, %cst_6) {device = ""} : (tensor, tensor, tensor) -> tensor + %4 = "tf.Cast"(%3) {Truncate = false, device = ""} : (tensor) -> tensor + %5 = "tf.Cast"(%4) {Truncate = false, device = ""} : (tensor) -> tensor + %6 = "tf.Sub"(%5, %cst_5) {device = ""} : (tensor, tensor) -> tensor + %7 = "tf.Identity"(%cst_3) {device = ""} : (tensor<1024x3xi8>) -> tensor<1024x3xi8> + %8 = "tf.Cast"(%7) {Truncate = false, device = ""} : (tensor<1024x3xi8>) -> tensor<1024x3xi32> + %9 = "tf.BatchMatMulV2"(%6, %8) {adj_x = false, adj_y = false, device = ""} : (tensor, tensor<1024x3xi32>) -> tensor + %10 = "tf.AddV2"(%9, %cst_2) {device = ""} : (tensor, tensor<3xi32>) -> tensor + %11 = "tf.Cast"(%10) {Truncate = false, device = ""} : (tensor) -> tensor + %12 = "tf.Mul"(%11, %cst) {device = ""} : (tensor, tensor) -> tensor + %13 = "tf.Identity"(%12) {device = ""} : (tensor) -> tensor + %14 = "tf.Identity"(%13) {device = ""} : (tensor) -> tensor + return %14 : tensor + } + +// CHECK-LABEL: func @dynamic_shape_batch_matmul +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<2> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %[[CONST_3:.*]] = "tf.Const"() <{value = dense<[1024, 3]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-DAG: %[[CONST_4:.*]] = "tf.Const"() <{value = dense<> : tensor<0xi64>}> : () -> tensor<0xi64> +// CHECK-DAG: %[[CONST_5:.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor +// CHECK-DAG: %[[WEIGHT:.*]] = "tf.Const"() <{{{value = .* : tensor<1024x3xi8>}}}> {device = ""} : () -> tensor<1024x3xi8> +// CHECK: %[[CAST:.*]] = "tf.Cast"({{.*}}) <{Truncate = false}> {device = ""} : (tensor) -> tensor +// CHECK: %[[SHAPE:.*]] = "tf.Shape"(%[[CAST]]) : (tensor) -> tensor<3xi64> +// CHECK: %[[SLICE_1:.*]] = "tf.Slice"(%[[SHAPE]], %[[CONST]], %[[CONST_2]]) : (tensor<3xi64>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> +// CHECK: %[[SLICE_2:.*]] = "tf.Slice"(%[[SHAPE]], %[[CONST_2]], %[[CONST_1]]) : (tensor<3xi64>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi64> +// CHECK: %[[BROADCAST_ARGS:.*]] = "tf.BroadcastArgs"(%[[SLICE_1]], %[[CONST_4]]) : (tensor<1xi64>, tensor<0xi64>) -> tensor<1xi64> +// CHECK: %[[CONCAT_1:.*]] = "tf.Concat"(%[[CONST_5]], 
%[[BROADCAST_ARGS]], %[[SLICE_2]]) : (tensor, tensor<1xi64>, tensor<2xi64>) -> tensor<3xi64> +// CHECK: %[[CONCAT_2:.*]] = "tf.Concat"(%[[CONST_5]], %[[BROADCAST_ARGS]], %[[CONST_3]]) : (tensor, tensor<1xi64>, tensor<2xi64>) -> tensor<3xi64> +// CHECK: %[[BROADCAST_1:.*]] = "tf.BroadcastTo"(%[[CAST]], %[[CONCAT_1]]) : (tensor, tensor<3xi64>) -> tensor +// CHECK: %[[BROADCAST_2:.*]] = "tf.BroadcastTo"(%[[WEIGHT]], %[[CONCAT_2]]) : (tensor<1024x3xi8>, tensor<3xi64>) -> tensor +// CHECK: %[[DOT:.*]] = "tf.XlaDotV2"(%[[BROADCAST_1]], %[[BROADCAST_2]]) +} + +// ----- + +module attributes {} { + func.func @batch_matmul_two_tensors_with_static_shape(%arg0: tensor<2x2x2xf32>, %arg1: tensor<2x2x2xf32>) -> (tensor<2x2x2xf32>) { + %cst = "tf.Const"() {value = dense<-5.450000e+01> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<0.0156862754> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<-5.000000e-01> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {value = dense<0.0274509806> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {value = dense<-55> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_6 = "tf.Const"() {value = dense<1.270000e+02> : tensor} : () -> tensor + %0 = "tf.Div"(%arg1, %cst_0) : (tensor<2x2x2xf32>, tensor) -> tensor<2x2x2xf32> + %1 = "tf.AddV2"(%0, %cst_1) : (tensor<2x2x2xf32>, tensor) -> tensor<2x2x2xf32> + %2 = "tf.Floor"(%1) : (tensor<2x2x2xf32>) -> tensor<2x2x2xf32> + %3 = "tf.ClipByValue"(%2, %cst_5, %cst_6) : (tensor<2x2x2xf32>, tensor, tensor) -> tensor<2x2x2xf32> + %4 = "tf.Cast"(%3) {Truncate = false} : (tensor<2x2x2xf32>) -> tensor<2x2x2xi8> + %5 = "tf.Div"(%arg0, %cst_3) : (tensor<2x2x2xf32>, tensor) -> tensor<2x2x2xf32> + %6 = "tf.AddV2"(%5, %cst) : (tensor<2x2x2xf32>, tensor) -> tensor<2x2x2xf32> + %7 = "tf.Floor"(%6) : (tensor<2x2x2xf32>) -> tensor<2x2x2xf32> + %8 = "tf.ClipByValue"(%7, %cst_5, %cst_6) : (tensor<2x2x2xf32>, tensor, tensor) -> tensor<2x2x2xf32> + %9 = "tf.Cast"(%8) {Truncate = false} : (tensor<2x2x2xf32>) -> tensor<2x2x2xi8> + %10 = "tf.Cast"(%4) {Truncate = false} : (tensor<2x2x2xi8>) -> tensor<2x2x2xi32> + %11 = "tf.Sub"(%10, %cst_2) : (tensor<2x2x2xi32>, tensor) -> tensor<2x2x2xi32> + %12 = "tf.Identity"(%9) : (tensor<2x2x2xi8>) -> tensor<2x2x2xi8> + %13 = "tf.Cast"(%12) {Truncate = false} : (tensor<2x2x2xi8>) -> tensor<2x2x2xi32> + %14 = "tf.Sub"(%13, %cst_4) : (tensor<2x2x2xi32>, tensor) -> tensor<2x2x2xi32> + %15 = "tf.BatchMatMulV2"(%11, %14) {adj_x = false, adj_y = false} : (tensor<2x2x2xi32>, tensor<2x2x2xi32>) -> tensor<2x2x2xi32> + %16 = "tf.Cast"(%15) {Truncate = false} : (tensor<2x2x2xi32>) -> tensor<2x2x2xf32> + %17 = "tf.Mul"(%16, %cst_0) : (tensor<2x2x2xf32>, tensor) -> tensor<2x2x2xf32> + %18 = "tf.AddV2"(%17, %cst) : (tensor<2x2x2xf32>, tensor) -> tensor<2x2x2xf32> + %19 = "tf.Floor"(%18) : (tensor<2x2x2xf32>) -> tensor<2x2x2xf32> + %20 = "tf.ClipByValue"(%19, %cst_5, %cst_6) : (tensor<2x2x2xf32>, tensor, tensor) -> tensor<2x2x2xf32> + %21 = "tf.Cast"(%20) {Truncate = false} : (tensor<2x2x2xf32>) -> tensor<2x2x2xi8> + %22 = "tf.Identity"(%21) {device = ""} : (tensor<2x2x2xi8>) -> tensor<2x2x2xi8> + %23 = "tf.Identity"(%22) {device = ""} : (tensor<2x2x2xi8>) -> tensor<2x2x2xi8> + %24 = "tf.Cast"(%23) : (tensor<2x2x2xi8>) -> tensor<2x2x2xi32> + %25 = "tf.Sub"(%24, %cst_4) : (tensor<2x2x2xi32>, tensor) -> tensor<2x2x2xi32> + %26 = "tf.Cast"(%25) : (tensor<2x2x2xi32>) -> 
tensor<2x2x2xf32> + %27 = "tf.Mul"(%26, %cst_3) : (tensor<2x2x2xf32>, tensor) -> tensor<2x2x2xf32> + return %27 : tensor<2x2x2xf32> + } + +// CHECK-LABEL: func @batch_matmul_two_tensors_with_static_shape +// CHECK: %[[arg1_div:.*]] = "tf.Div"(%arg1 +// CHECK: %[[arg1_add:.*]] = "tf.AddV2"(%[[arg1_div]] +// CHECK: %[[arg1_floor:.*]] = "tf.Floor"(%[[arg1_add]] +// CHECK: %[[arg1_clip:.*]] = "tf.ClipByValue"(%[[arg1_floor]] +// CHECK: %[[arg1_cast:.*]] = "tf.Cast"(%[[arg1_clip]] + +// CHECK: %[[arg0_div:.*]] = "tf.Div"(%arg0 +// CHECK: %[[arg0_add:.*]] = "tf.AddV2"(%[[arg0_div]] +// CHECK: %[[arg0_floor:.*]] = "tf.Floor"(%[[arg0_add]] +// CHECK: %[[arg0_clip:.*]] = "tf.ClipByValue"(%[[arg0_floor]] +// CHECK: %[[arg0_cast:.*]] = "tf.Cast"(%[[arg0_clip]] + +// CHECK: %[[matmul:.*]] = "tf.XlaDotV2"(%[[arg1_cast]], %[[arg0_cast]] +// CHECK-SAME: (tensor<2x2x2xi8>, tensor<2x2x2xi8>) -> tensor<2x2x2xi32> + +// CHECK: %[[matmul_sub:.*]] = "tf.Sub"(%[[matmul]] +// CHECK: %[[matmul_cast:.*]] = "tf.Cast"(%[[matmul_sub]] +// CHECK: %[[matmul_mul:.*]] = "tf.Mul"(%[[matmul_cast]] +// CHECK: %[[matmul_add:.*]] = "tf.AddV2"(%[[matmul_mul]] +// CHECK: %[[matmul_floor:.*]] = "tf.Floor"(%[[matmul_add]] +// CHECK: %[[matmul_clip:.*]] = "tf.ClipByValue"(%[[matmul_floor]] +} + +// ----- + +module attributes {} { + func.func @batch_matmul_two_tensors_with_dynamic_shape(%arg0: tensor<2x?x?xf32>, %arg1: tensor<2x?x?xf32>) -> (tensor<2x?x?xf32>) { + %cst = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %cst_0 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> + %cst_1 = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %cst_2 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {value = dense<2> : tensor<1xi64>} : () -> tensor<1xi64> + %cst_4 = "tf.Const"() {value = dense<-55> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> + %cst_6 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_7 = "tf.Const"() {value = dense<55> : tensor} : () -> tensor + %cst_8 = "tf.Const"() {value = dense<-5.450000e+01> : tensor} : () -> tensor + %cst_9 = "tf.Const"() {value = dense<0.0156862754> : tensor} : () -> tensor + %cst_10 = "tf.Const"() {value = dense<-5.000000e-01> : tensor} : () -> tensor + %cst_11 = "tf.Const"() {value = dense<0.0274509806> : tensor} : () -> tensor + %cst_12 = "tf.Const"() {value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_13 = "tf.Const"() {value = dense<1.270000e+02> : tensor} : () -> tensor + %0 = "tf.Div"(%arg1, %cst_9) : (tensor<2x?x?xf32>, tensor) -> tensor<2x?x?xf32> + %1 = "tf.AddV2"(%0, %cst_10) : (tensor<2x?x?xf32>, tensor) -> tensor<2x?x?xf32> + %2 = "tf.Floor"(%1) : (tensor<2x?x?xf32>) -> tensor<2x?x?xf32> + %3 = "tf.ClipByValue"(%2, %cst_12, %cst_13) : (tensor<2x?x?xf32>, tensor, tensor) -> tensor<2x?x?xf32> + %4 = "tf.Cast"(%3) {Truncate = false} : (tensor<2x?x?xf32>) -> tensor<2x?x?xi8> + %5 = "tf.Div"(%arg0, %cst_11) : (tensor<2x?x?xf32>, tensor) -> tensor<2x?x?xf32> + %6 = "tf.AddV2"(%5, %cst_8) : (tensor<2x?x?xf32>, tensor) -> tensor<2x?x?xf32> + %7 = "tf.Floor"(%6) : (tensor<2x?x?xf32>) -> tensor<2x?x?xf32> + %8 = "tf.ClipByValue"(%7, %cst_12, %cst_13) : (tensor<2x?x?xf32>, tensor, tensor) -> tensor<2x?x?xf32> + %9 = "tf.Cast"(%8) {Truncate = false} : (tensor<2x?x?xf32>) -> tensor<2x?x?xi8> + %10 = "tf.Shape"(%4) : (tensor<2x?x?xi8>) -> tensor<3xi64> + %11 = "tf.Slice"(%10, %cst, %cst_1) : (tensor<3xi64>, 
tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %12 = "tf.Slice"(%10, %cst_1, %cst_0) : (tensor<3xi64>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi64> + %13 = "tf.Shape"(%9) : (tensor<2x?x?xi8>) -> tensor<3xi64> + %14 = "tf.Slice"(%13, %cst, %cst_1) : (tensor<3xi64>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xi64> + %15 = "tf.Slice"(%13, %cst_1, %cst_0) : (tensor<3xi64>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi64> + %16 = "tf.BroadcastArgs"(%11, %14) : (tensor<1xi64>, tensor<1xi64>) -> tensor<1xi64> + %17 = "tf.Concat"(%cst_2, %16, %12) : (tensor, tensor<1xi64>, tensor<2xi64>) -> tensor<3xi64> + %18 = "tf.Concat"(%cst_2, %16, %15) : (tensor, tensor<1xi64>, tensor<2xi64>) -> tensor<3xi64> + %19 = "tf.BroadcastTo"(%4, %17) : (tensor<2x?x?xi8>, tensor<3xi64>) -> tensor<2x?x?xi8> + %20 = "tf.BroadcastTo"(%9, %18) : (tensor<2x?x?xi8>, tensor<3xi64>) -> tensor<2x?x?xi8> + %21 = "tf.XlaDotV2"(%19, %20) {dimension_numbers = "\22\01\00\1A\01\00\12\01\01\0A\01\02", precision_config = ""} : (tensor<2x?x?xi8>, tensor<2x?x?xi8>) -> tensor<2x?x?xi32> + %22 = "tf.Cast"(%19) {Truncate = false} : (tensor<2x?x?xi8>) -> tensor<2x?x?xi32> + %23 = "tf.Sum"(%22, %cst_3) {keep_dims = true} : (tensor<2x?x?xi32>, tensor<1xi64>) -> tensor<2x?x1xi32> + %24 = "tf.Mul"(%23, %cst_4) : (tensor<2x?x1xi32>, tensor) -> tensor<2x?x1xi32> + %25 = "tf.Cast"(%20) {Truncate = false} : (tensor<2x?x?xi8>) -> tensor<2x?x?xi32> + %26 = "tf.Sum"(%25, %cst_5) {keep_dims = true} : (tensor<2x?x?xi32>, tensor<1xi64>) -> tensor<2x1x?xi32> + %27 = "tf.Mul"(%26, %cst_6) : (tensor<2x1x?xi32>, tensor) -> tensor<2x1x?xi32> + %28 = "tf.Shape"(%20) : (tensor<2x?x?xi8>) -> tensor<3xi64> + %29 = "tf.StridedSlice"(%28, %cst_5, %cst_3, %cst_5) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<3xi64>, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor<1xi64> + %30 = "tf.Cast"(%29) {Truncate = false} : (tensor<1xi64>) -> tensor<1xi32> + %31 = "tf.Mul"(%30, %cst_7) : (tensor<1xi32>, tensor) -> tensor<1xi32> + %32 = "tf.Add"(%24, %27) : (tensor<2x?x1xi32>, tensor<2x1x?xi32>) -> tensor<2x?x?xi32> + %33 = "tf.Sub"(%32, %31) : (tensor<2x?x?xi32>, tensor<1xi32>) -> tensor<2x?x?xi32> + %34 = "tf.Sub"(%21, %33) : (tensor<2x?x?xi32>, tensor<2x?x?xi32>) -> tensor<2x?x?xi32> + %35 = "tf.Cast"(%34) {Truncate = false} : (tensor<2x?x?xi32>) -> tensor<2x?x?xf32> + %36 = "tf.Mul"(%35, %cst_9) : (tensor<2x?x?xf32>, tensor) -> tensor<2x?x?xf32> + %37 = "tf.AddV2"(%36, %cst_8) : (tensor<2x?x?xf32>, tensor) -> tensor<2x?x?xf32> + %38 = "tf.Floor"(%37) : (tensor<2x?x?xf32>) -> tensor<2x?x?xf32> + %39 = "tf.ClipByValue"(%38, %cst_12, %cst_13) : (tensor<2x?x?xf32>, tensor, tensor) -> tensor<2x?x?xf32> + %40 = "tf.Cast"(%39) {Truncate = false} : (tensor<2x?x?xf32>) -> tensor<2x?x?xi8> + %41 = "tf.Identity"(%40) {device = ""} : (tensor<2x?x?xi8>) -> tensor<2x?x?xi8> + %42 = "tf.Identity"(%41) {device = ""} : (tensor<2x?x?xi8>) -> tensor<2x?x?xi8> + %43 = "tf.Cast"(%42) : (tensor<2x?x?xi8>) -> tensor<2x?x?xi32> + %44 = "tf.Sub"(%43, %cst_4) : (tensor<2x?x?xi32>, tensor) -> tensor<2x?x?xi32> + %45 = "tf.Cast"(%44) : (tensor<2x?x?xi32>) -> tensor<2x?x?xf32> + %46 = "tf.Mul"(%45, %cst_11) : (tensor<2x?x?xf32>, tensor) -> tensor<2x?x?xf32> + return %46 : tensor<2x?x?xf32> + } + +// CHECK-LABEL: func @batch_matmul_two_tensors_with_dynamic_shape +// CHECK: %[[arg1_div:.*]] = "tf.Div"(%arg1 +// CHECK: %[[arg1_add:.*]] = "tf.AddV2"(%[[arg1_div]] +// CHECK: %[[arg1_floor:.*]] = 
"tf.Floor"(%[[arg1_add]] +// CHECK: %[[arg1_clip:.*]] = "tf.ClipByValue"(%[[arg1_floor]] +// CHECK: %[[arg1_cast:.*]] = "tf.Cast"(%[[arg1_clip]] + +// CHECK: %[[arg0_div:.*]] = "tf.Div"(%arg0 +// CHECK: %[[arg0_add:.*]] = "tf.AddV2"(%[[arg0_div]] +// CHECK: %[[arg0_floor:.*]] = "tf.Floor"(%[[arg0_add]] +// CHECK: %[[arg0_clip:.*]] = "tf.ClipByValue"(%[[arg0_floor]] +// CHECK: %[[arg0_cast:.*]] = "tf.Cast"(%[[arg0_clip]] + +// CHECK: %[[arg1_broad:.*]] = "tf.BroadcastTo"(%[[arg1_cast]] +// CHECK: %[[arg0_broad:.*]] = "tf.BroadcastTo"(%[[arg0_cast]] + +// CHECK: %[[matmul:.*]] = "tf.XlaDotV2"(%[[arg1_broad]], %[[arg0_broad]] +// CHECK-SAME: (tensor<2x?x?xi8>, tensor<2x?x?xi8>) -> tensor<2x?x?xi32> + +// CHECK: %[[arg0_shape:.*]] = "tf.Shape"(%[[arg0_broad]] +// CHECK: %[[shape_zp_contribute:.*]] = "tf.StridedSlice"(%[[arg0_shape]] +// CHECK: %[[shape_zp_contribute_cast:.*]] = "tf.Cast"(%[[shape_zp_contribute]] +// CHECK: %[[shape_zp_contribute_mul:.*]] = "tf.Mul"(%[[shape_zp_contribute_cast]] +// CHECK: %[[zp:.*]] = "tf.Sub"({{.*}}, %[[shape_zp_contribute_mul]]) + +// CHECK: %[[matmul_sub:.*]] = "tf.Sub"(%[[matmul]], %[[zp]] +// CHECK: %[[matmul_cast:.*]] = "tf.Cast"(%[[matmul_sub]] +// CHECK: %[[matmul_mul:.*]] = "tf.Mul"(%[[matmul_cast]] +// CHECK: %[[matmul_add:.*]] = "tf.AddV2"(%[[matmul_mul]] +// CHECK: %[[matmul_floor:.*]] = "tf.Floor"(%[[matmul_add]] +// CHECK: %[[matmul_clip:.*]] = "tf.ClipByValue"(%[[matmul_floor]] +} + +// ----- + +module attributes {} { + func.func @einsum(%arg0: tensor<2x3xf32>) -> (tensor<2x4xf32>) { + %cst = "tf.Const"() {device = "", value = dense<1.4049983> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "", value = dense<2.62249741E-5> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {device = "", value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {device = "", value = dense<[[69, 56, 29, 41], [106, 108, 118, 127], [51, 52, 50, 30]]> : tensor<3x4xi8>} : () -> tensor<3x4xi8> + %cst_3 = "tf.Const"() {device = "", value = dense<0.0037096194> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {device = "", value = dense<-128> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {device = "", value = dense<1.270000e+02> : tensor} : () -> tensor + %0 = "tf.Div"(%arg0, %cst_3) {device = ""} : (tensor<2x3xf32>, tensor) -> tensor<2x3xf32> + %1 = "tf.AddV2"(%0, %cst_1) {device = ""} : (tensor<2x3xf32>, tensor) -> tensor<2x3xf32> + %2 = "tf.Maximum"(%1, %cst_1) {device = ""} : (tensor<2x3xf32>, tensor) -> tensor<2x3xf32> + %3 = "tf.Minimum"(%2, %cst_5) {device = ""} : (tensor<2x3xf32>, tensor) -> tensor<2x3xf32> + %4 = "tf.Round"(%3) {device = ""} : (tensor<2x3xf32>) -> tensor<2x3xf32> + %5 = "tf.Cast"(%4) {Truncate = false, device = ""} : (tensor<2x3xf32>) -> tensor<2x3xi8> + %6 = "tf.Identity"(%5) {device = ""} : (tensor<2x3xi8>) -> tensor<2x3xi8> + %7 = "tf.Cast"(%6) {Truncate = false, device = ""} : (tensor<2x3xi8>) -> tensor<2x3xi32> + %8 = "tf.Sub"(%7, %cst_4) {device = ""} : (tensor<2x3xi32>, tensor) -> tensor<2x3xi32> + %9 = "tf.Identity"(%cst_2) {device = ""} : (tensor<3x4xi8>) -> tensor<3x4xi8> + %10 = "tf.Cast"(%9) {Truncate = false, device = ""} : (tensor<3x4xi8>) -> tensor<3x4xi32> + %11 = "tf.Einsum"(%8, %10) {device = "", equation = "ab,bc->ac"} : (tensor<2x3xi32>, tensor<3x4xi32>) -> tensor<2x4xi32> + %12 = "tf.Cast"(%11) {Truncate = false, device = ""} : (tensor<2x4xi32>) -> tensor<2x4xf32> + %13 = "tf.Mul"(%12, %cst_0) {device = ""} : (tensor<2x4xf32>, tensor) -> tensor<2x4xf32> + %14 = "tf.Relu"(%13) {device = ""} : 
(tensor<2x4xf32>) -> tensor<2x4xf32> + %15 = "tf.Minimum"(%14, %cst) {device = ""} : (tensor<2x4xf32>, tensor) -> tensor<2x4xf32> + %16 = "tf.Identity"(%15) {device = ""} : (tensor<2x4xf32>) -> tensor<2x4xf32> + %17 = "tf.Identity"(%16) {device = ""} : (tensor<2x4xf32>) -> tensor<2x4xf32> + func.return %17 : tensor<2x4xf32> + } + +// CHECK-LABEL: func @einsum +// CHECK: %[[CAST:.*]] = "tf.Cast"( +// CHECK: %[[XLADOTV2_0:.*]] = "tf.XlaDotV2"(%[[CAST]], +// CHECK: %[[SUB_0:.*]] = "tf.Sub"(%[[XLADOTV2_0]], +} + +// ----- + +module attributes {} { + func.func @einsum_with_batch(%arg0: tensor<2x3x4xf32>) -> (tensor<2x3x5xf32>) { + %cst = "tf.Const"() {device = "", value = dense<2.02468872> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "", value = dense<3.07491428E-5> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {device = "", value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {device = "", value = dense<[[[103, 11, 81, 127, 25], [13, 21, 76, 42, 63], [114, 15, 18, 64, 91], [73, 99, 21, 46, 66]], [[11, 127, 65, 72, 82], [31, 39, 111, 69, 20], [82, 37, 34, 76, 13], [61, 70, 69, 112, 3]]]> : tensor<2x4x5xi8>} : () -> tensor<2x4x5xi8> + %cst_3 = "tf.Const"() {device = "", value = dense<0.00391459931> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {device = "", value = dense<-128> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {device = "", value = dense<1.270000e+02> : tensor} : () -> tensor + %0 = "tf.Div"(%arg0, %cst_3) {device = ""} : (tensor<2x3x4xf32>, tensor) -> tensor<2x3x4xf32> + %1 = "tf.AddV2"(%0, %cst_1) {device = ""} : (tensor<2x3x4xf32>, tensor) -> tensor<2x3x4xf32> + %2 = "tf.Maximum"(%1, %cst_1) {device = ""} : (tensor<2x3x4xf32>, tensor) -> tensor<2x3x4xf32> + %3 = "tf.Minimum"(%2, %cst_5) {device = ""} : (tensor<2x3x4xf32>, tensor) -> tensor<2x3x4xf32> + %4 = "tf.Round"(%3) {device = ""} : (tensor<2x3x4xf32>) -> tensor<2x3x4xf32> + %5 = "tf.Cast"(%4) {Truncate = false, device = ""} : (tensor<2x3x4xf32>) -> tensor<2x3x4xi8> + %6 = "tf.Identity"(%5) {device = ""} : (tensor<2x3x4xi8>) -> tensor<2x3x4xi8> + %7 = "tf.Cast"(%6) {Truncate = false, device = ""} : (tensor<2x3x4xi8>) -> tensor<2x3x4xi32> + %8 = "tf.Sub"(%7, %cst_4) {device = ""} : (tensor<2x3x4xi32>, tensor) -> tensor<2x3x4xi32> + %9 = "tf.Identity"(%cst_2) {device = ""} : (tensor<2x4x5xi8>) -> tensor<2x4x5xi8> + %10 = "tf.Cast"(%9) {Truncate = false, device = ""} : (tensor<2x4x5xi8>) -> tensor<2x4x5xi32> + %11 = "tf.Einsum"(%8, %10) {device = "", equation = "abc,acd->abd"} : (tensor<2x3x4xi32>, tensor<2x4x5xi32>) -> tensor<2x3x5xi32> + %12 = "tf.Cast"(%11) {Truncate = false, device = ""} : (tensor<2x3x5xi32>) -> tensor<2x3x5xf32> + %13 = "tf.Mul"(%12, %cst_0) {device = ""} : (tensor<2x3x5xf32>, tensor) -> tensor<2x3x5xf32> + %14 = "tf.Relu"(%13) {device = ""} : (tensor<2x3x5xf32>) -> tensor<2x3x5xf32> + %15 = "tf.Minimum"(%14, %cst) {device = ""} : (tensor<2x3x5xf32>, tensor) -> tensor<2x3x5xf32> + %16 = "tf.Identity"(%15) {device = ""} : (tensor<2x3x5xf32>) -> tensor<2x3x5xf32> + %17 = "tf.Identity"(%16) {device = ""} : (tensor<2x3x5xf32>) -> tensor<2x3x5xf32> + func.return %17 : tensor<2x3x5xf32> + } + +// CHECK-LABEL: func @einsum_with_batch +// CHECK: %[[CAST:.*]] = "tf.Cast"( +// CHECK: %[[XLADOTV2_0:.*]] = "tf.XlaDotV2"(%[[CAST]], +// CHECK: %[[SUB_0:.*]] = "tf.Sub"(%[[XLADOTV2_0]], +} + +// ----- + +module attributes {} { + func.func @einsum_with_additional_einsums(%arg0: tensor<2x6x4x5xf32>, %arg1: tensor<2x3x4x5xf32>) -> (tensor<2x4x3x6xf32>) { + %cst = "tf.Const"() 
{device = "", value = dense<3.064220e+00> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "", value = dense<1.5347272E-5> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {device = "", value = dense<-1.280000e+02> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {device = "", value = dense<0.0039161914> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {device = "", value = dense<0.00391892809> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {device = "", value = dense<1.270000e+02> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {device = "", value = dense<-128> : tensor} : () -> tensor + %0 = "tf.Div"(%arg0, %cst_3) {device = ""} : (tensor<2x6x4x5xf32>, tensor) -> tensor<2x6x4x5xf32> + %1 = "tf.AddV2"(%0, %cst_1) {device = ""} : (tensor<2x6x4x5xf32>, tensor) -> tensor<2x6x4x5xf32> + %2 = "tf.Maximum"(%1, %cst_1) {device = ""} : (tensor<2x6x4x5xf32>, tensor) -> tensor<2x6x4x5xf32> + %3 = "tf.Minimum"(%2, %cst_4) {device = ""} : (tensor<2x6x4x5xf32>, tensor) -> tensor<2x6x4x5xf32> + %4 = "tf.Round"(%3) {device = ""} : (tensor<2x6x4x5xf32>) -> tensor<2x6x4x5xf32> + %5 = "tf.Cast"(%4) {device = ""} : (tensor<2x6x4x5xf32>) -> tensor<2x6x4x5xi8> + %6 = "tf.Div"(%arg1, %cst_2) {device = ""} : (tensor<2x3x4x5xf32>, tensor) -> tensor<2x3x4x5xf32> + %7 = "tf.AddV2"(%6, %cst_1) {device = ""} : (tensor<2x3x4x5xf32>, tensor) -> tensor<2x3x4x5xf32> + %8 = "tf.Maximum"(%7, %cst_1) {device = ""} : (tensor<2x3x4x5xf32>, tensor) -> tensor<2x3x4x5xf32> + %9 = "tf.Minimum"(%8, %cst_4) {device = ""} : (tensor<2x3x4x5xf32>, tensor) -> tensor<2x3x4x5xf32> + %10 = "tf.Round"(%9) {device = ""} : (tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xf32> + %11 = "tf.Cast"(%10) {device = ""} : (tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xi8> + %12 = "tf.Identity"(%11) {device = ""} : (tensor<2x3x4x5xi8>) -> tensor<2x3x4x5xi8> + %13 = "tf.Cast"(%12) {Truncate = false, device = ""} : (tensor<2x3x4x5xi8>) -> tensor<2x3x4x5xi32> + %14 = "tf.Sub"(%13, %cst_5) {device = ""} : (tensor<2x3x4x5xi32>, tensor) -> tensor<2x3x4x5xi32> + %15 = "tf.Identity"(%5) {device = ""} : (tensor<2x6x4x5xi8>) -> tensor<2x6x4x5xi8> + %16 = "tf.Cast"(%15) {Truncate = false, device = ""} : (tensor<2x6x4x5xi8>) -> tensor<2x6x4x5xi32> + %17 = "tf.Sub"(%16, %cst_5) {device = ""} : (tensor<2x6x4x5xi32>, tensor) -> tensor<2x6x4x5xi32> + %18 = "tf.Einsum"(%14, %17) {device = "", equation = "abcd,aecd->acbe"} : (tensor<2x3x4x5xi32>, tensor<2x6x4x5xi32>) -> tensor<2x4x3x6xi32> + %19 = "tf.Cast"(%18) {Truncate = false, device = ""} : (tensor<2x4x3x6xi32>) -> tensor<2x4x3x6xf32> + %20 = "tf.Mul"(%19, %cst_0) {device = ""} : (tensor<2x4x3x6xf32>, tensor) -> tensor<2x4x3x6xf32> + %21 = "tf.Relu"(%20) {device = ""} : (tensor<2x4x3x6xf32>) -> tensor<2x4x3x6xf32> + %22 = "tf.Minimum"(%21, %cst) {device = ""} : (tensor<2x4x3x6xf32>, tensor) -> tensor<2x4x3x6xf32> + %23 = "tf.Identity"(%22) {device = ""} : (tensor<2x4x3x6xf32>) -> tensor<2x4x3x6xf32> + %24 = "tf.Identity"(%23) {device = ""} : (tensor<2x4x3x6xf32>) -> tensor<2x4x3x6xf32> + return %24 : tensor<2x4x3x6xf32> + } + +// CHECK-LABEL: func @einsum_with_additional_einsums +// CHECK: %[[ARG1:.*]] = "tf.Cast"( +// CHECK: %[[ARG0:.*]] = "tf.Cast"( +// CHECK: %[[XLADOTV2:.*]] = "tf.XlaDotV2"(%[[ARG0]], %[[ARG1]] + +// CHECK: %[[ARG0_CAST:.*]] = "tf.Cast"(%[[ARG0]] +// CHECK: %[[ARG0_REDUCE:.*]] = "tf.Einsum"(%[[ARG0_CAST]] +// CHECK-SAME: __tf_quant_created_einsum +// CHECK: %[[ARG0_ZP:.*]] = "tf.Mul"(%[[ARG0_REDUCE]] + +// CHECK: %[[ARG1_CAST:.*]] = "tf.Cast"(%[[ARG1]] +// CHECK: %[[ARG1_REDUCE:.*]] = 
"tf.Einsum"({{.*}}, %[[ARG1_CAST]] +// CHECK-SAME: __tf_quant_created_einsum +// CHECK: %[[ARG1_ZP:.*]] = "tf.Mul"(%[[ARG1_REDUCE]] + +// CHECK: %[[ZP:.*]] = "tf.Add"(%[[ARG0_ZP]], %[[ARG1_ZP]]) +} + diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_unfreeze_constants.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_unfreeze_constants.mlir new file mode 100644 index 000000000000..06fd984ec6db --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/tf_unfreeze_constants.mlir @@ -0,0 +1,284 @@ +// RUN: tf-quant-opt %s -tf-quant-unfreeze-constants='size_threshold_in_bytes=16' \ +// RUN: -allow-unregistered-dialect -mlir-disable-threading \ +// RUN: -split-input-file -verify-diagnostics | FileCheck %s + +// Tests a case with one ConstOp and a tf_saved_model.session_initializer with an empty initializers. +module attributes {tf_saved_model.semantics} { + + "tf_saved_model.session_initializer"() {initializers = []} : () -> () +// Check that the init function is created & added to the initializers attribute. +// CHECK: "tf_saved_model.session_initializer"() +// CHECK-SAME: initializers = [@init_func_restore_op] + +// CHECK: func.func @init_func_restore_op() +// CHECK-SAME: tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"] +// CHECK-SAME: tf_saved_model.initializer_type = "restore_op" + +// Check that variable is initialized by assigning the const value within the initializer function. +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<8xf32>}> +// CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_0".*}} +// CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[CST_0]]) + + func.func @serving_default() -> (tensor<8xf32> {tf_saved_model.index_path = ["output"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "output:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst_0 = "tf.Const"() {device = "", value = dense<1.0> : tensor<8xf32>} : () -> tensor<8xf32> + return %cst_0 : tensor<8xf32> + } +// Check that the ConstOp's use is replaced by VarHandleOp -> ReadVariableOp. +// CHECK: @serving_default +// CHECK-DAG: %[[VAR_HANDLE_2:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_0".*}} : () -> tensor>> +// CHECK-DAG: %[[READ_VAR_0:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE_2]]) : (tensor>>) -> tensor<8xf32> +// CHECK: return %[[READ_VAR_0]] : tensor<8xf32> +} + +// ----- + +// Tests the case when there's no tf_saved_model.session_initializer. +module attributes {tf_saved_model.semantics} { + +// Check that a new tf_saved_model.session_initializer is created, along with an initialier function. 
+// CHECK: "tf_saved_model.session_initializer"() +// CHECK-SAME: initializers = [@init_func_restore_op] + +// CHECK: func.func @init_func_restore_op() +// CHECK-SAME: tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"] +// CHECK-SAME: tf_saved_model.initializer_type = "restore_op" + +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{{{.*value = dense<1.000000e\+00> : tensor<8xf32>.*}}}> +// CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_0".*}} +// CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[CST_0]]) + +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{{{.*value = dense<2.000000e\+00> : tensor<8xf32>.*}}}> +// CHECK-DAG: %[[VAR_HANDLE_1:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_1".*}} +// CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_1]], %[[CST_1]]) + + func.func @serving_default() -> (tensor<8xf32> {tf_saved_model.index_path = ["output"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "output:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst_0 = "tf.Const"() {device = "", value = dense<1.0> : tensor<8xf32>} : () -> tensor<8xf32> + %cst_1 = "tf.Const"() {device = "", value = dense<2.0> : tensor<8xf32>} : () -> tensor<8xf32> + %0 = "tf.AddV2"(%cst_0, %cst_1) : (tensor<8xf32>, tensor<8xf32>) -> tensor<8xf32> + return %0 : tensor<8xf32> + } +// CHECK: @serving_default +// CHECK-DAG: %[[VAR_HANDLE_2:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_0".*}} : () -> tensor>> +// CHECK-DAG: %[[READ_VAR_0:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE_2]]) : (tensor>>) -> tensor<8xf32> +// CHECK-DAG: %[[VAR_HANDLE_3:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_1".*}} : () -> tensor>> +// CHECK-DAG: %[[READ_VAR_1:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE_3]]) : (tensor>>) -> tensor<8xf32> +// CHECK-DAG: %[[ADD_0:.*]] = "tf.AddV2"(%[[READ_VAR_0]], %[[READ_VAR_1]]) +// CHECK: return %[[ADD_0]] : tensor<8xf32> +} + +// ----- + +// Tests the case when there's a tf_saved_model.session_initializer and an empty init function. 
+module attributes {tf_saved_model.semantics} { + + "tf_saved_model.session_initializer"() {initializers = [@init]} : () -> () +// CHECK: "tf_saved_model.session_initializer"() +// CHECK-SAME: initializers = [@init] + + func.func @init() attributes {tf_saved_model.exported_names = ["tf_saved_model.session_initializer_init"], tf_saved_model.initializer_type = "restore_op"} { + return + } +// CHECK: func.func @init() +// CHECK-SAME: tf_saved_model.exported_names = ["tf_saved_model.session_initializer_init"] +// CHECK-SAME: tf_saved_model.initializer_type = "restore_op" + +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<8xf32>}> +// CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() +// CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[CST_0]]) + +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{value = dense<2.000000e+00> : tensor<8xf32>}> +// CHECK-DAG: %[[VAR_HANDLE_1:.*]] = "tf.VarHandleOp"() +// CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_1]], %[[CST_1]]) + + func.func @serving_default(%arg0: tensor<8xf32> {tf_saved_model.index_path = ["input"]}) -> (tensor<8xf32> {tf_saved_model.index_path = ["output"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "StatefulPartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst_0 = "tf.Const"() {device = "", value = dense<1.0> : tensor<8xf32>} : () -> tensor<8xf32> + %cst_1 = "tf.Const"() {device = "", value = dense<2.0> : tensor<8xf32>} : () -> tensor<8xf32> + %0 = "tf.Sub"(%cst_0, %cst_1) : (tensor<8xf32>, tensor<8xf32>) -> tensor<8xf32> + return %0 : tensor<8xf32> + } +// CHECK: @serving_default +// CHECK-DAG: %[[VAR_HANDLE_2:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_0".*}} : () -> tensor>> +// CHECK-DAG: %[[READ_VAR_0:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE_2]]) : (tensor>>) -> tensor<8xf32> +// CHECK-DAG: %[[VAR_HANDLE_3:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_1".*}} : () -> tensor>> +// CHECK-DAG: %[[READ_VAR_1:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE_3]]) : (tensor>>) -> tensor<8xf32> +// CHECK-DAG: %[[SUB_0:.*]] = "tf.Sub"(%[[READ_VAR_0]], %[[READ_VAR_1]]) +// CHECK: return %[[SUB_0]] : tensor<8xf32> +} + +// ----- + +// Tests the case when there's a tf_saved_model.session_initializer and an init function whose type is "init_op". +module attributes {tf_saved_model.semantics} { + + "tf_saved_model.session_initializer"() {initializers = [@init]} : () -> () +// Check that @init_func_restore_op is added to the initializers list. +// CHECK: "tf_saved_model.session_initializer"() +// CHECK-SAME: initializers = [@init, @init_func_restore_op] + +// Check that @init_func_restore_op is newly created with variable initializations. +// CHECK: @init_func_restore_op() +// CHECK-SAME: tf_saved_model.exported_names = ["tf_saved_model.session_initializer_restore_op"] +// CHECK-SAME: tf_saved_model.initializer_type = "restore_op" + +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<3.000000e+00> : tensor<8xf32>}> +// CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() +// CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[CST_0]]) + + func.func @init() attributes {tf_saved_model.exported_names = ["tf_saved_model.session_initializer_init"], tf_saved_model.initializer_type = "init_op"} { + return + } +// Check that @init is not removed. 
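+// Since @init has initializer_type "init_op" rather than "restore_op", the
+// variable restores are expected to go into the newly created
+// @init_func_restore_op, as checked above.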
+// CHECK: @init() +// CHECK-SAME: tf_saved_model.initializer_type = "init_op" + + func.func @serving_default() -> (tensor<8xf32> {tf_saved_model.index_path = ["output"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "output:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst_0 = "tf.Const"() {device = "", value = dense<3.0> : tensor<8xf32>} : () -> tensor<8xf32> + return %cst_0 : tensor<8xf32> + } +} + +// ----- + +// Tests the case when there is no ConstOp. +module attributes {tf_saved_model.semantics} { + +// Check that nothing happens when there's no ConstOp in the graph. +// CHECK-NOT: "tf_saved_model.session_initializer"() + + func.func @serving_default(%arg_0: tensor<5xf32> {tf_saved_model.index_path = ["input"]}) -> (tensor<5xf32> {tf_saved_model.index_path = ["output"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "inputs:0", outputs = "output:0"}, tf_saved_model.exported_names = ["serving_default"]} { + return %arg_0 : tensor<5xf32> + } +// CHECK: @serving_default(%[[ARG_0:.*]]: tensor<5xf32> {{.*}}) +// CHECK-NEXT: return %[[ARG_0]] : tensor<5xf32> +} + +// ----- + +// Tests that constants that are smaller than "size_threshold_in_bytes" are +// not converted to variables. This test uses the threshold of 16 bytes. + +module attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_func_restore_op]} : () -> () + + func.func @init_func_restore_op() attributes {tf_saved_model.exported_names = ["tf_saved_model.session_initializer_init"], + tf_saved_model.initializer_type = "restore_op"} { + return + } + + func.func @serving_default() -> (tensor<12xf32> {tf_saved_model.index_path = ["output"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "output:0"}, tf_saved_model.exported_names = ["serving_default"]} { + // Should be unfrozen. + %cst_0 = "tf.Const"() {value = dense<5.0> : tensor<8xf32>} : () -> tensor<8xf32> + // Consts below are smaller than or equal to the threshold so they + // should not be converted to variables. + %cst_1 = "tf.Const"() {value = dense<5.0> : tensor<4xf32>} : () -> tensor<4xf32> + %cst_axis = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %0 = "tf.ConcatV2"(%cst_0, %cst_1, %cst_axis) : (tensor<8xf32>, tensor<4xf32>, tensor) -> tensor<12xf32> + return %0 : tensor<12xf32> + } +// CHECK: func.func @init_func_restore_op() + +// Check that `tf.VarHandleOp` is only created for the constant that is larger +// than the threshold (16 bytes for this test). +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{{{.*value = dense<5.000000e\+00> : tensor<8xf32>.*}}}> +// CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_0".*}} +// CHECK-DAG: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[CST_0]]) + +// Make sure that there are no more `tf.VarHandleOp`s and `tf.AssignVariableOp`s +// in this function. +// CHECK-NOT: "tf.VarHandleOp" +// CHECK-NOT: "tf.AssignVariableOp" + +// Only the large constant is replaced with the `tf.VarHandleOp -> +// tf.ReadVariableOp` pattern and others remain as `tf.Const`s. 
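+// Size arithmetic for this test: tensor<8xf32> occupies 8 * 4 = 32 bytes, which
+// exceeds the 16-byte threshold, while tensor<4xf32> (16 bytes) and the scalar
+// axis constant are at or below it and therefore stay as tf.Const.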
+// CHECK: @serving_default +// CHECK-DAG: %[[VAR_HANDLE_2:.*]] = "tf.VarHandleOp"() {{.*shared_name = "const_0".*}} : () -> tensor>> +// CHECK-DAG: %[[READ_VAR_0:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE_2]]) : (tensor>>) -> tensor<8xf32> +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{{{.*value = dense<5.000000e\+00> : tensor<4xf32>.*}}}> +// CHECK-DAG: %[[AXIS:.*]] = "tf.Const"() <{{{.*value = dense<0> : tensor.*}}}> +// CHECK-DAG: %[[CONCAT:.*]] = "tf.ConcatV2"(%[[READ_VAR_0]], %[[CST_1]], %[[AXIS]]) +// CHECK: return %[[CONCAT]] : tensor<12xf32> +} + +// ----- + +// Tests a case where the ConstOp's location is a fused loc containing more +// than two strings to be combined to form the shared_name. It must not contain +// the character ";" (which is often used as a delimiter to join fused loc's +// items). + +module attributes {tf_saved_model.semantics} { +// CHECK: func.func @init_func_restore_op() +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<8xf32>}> +// Check that the variable's shared_name contains the fused loc's items joined +// by the delimiter "_" and suffixed with a number. +// CHECK-DAG: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() {{.*shared_name = "apple_banana_0".*}} +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE_0]], %[[CST_0]]) + + func.func @serving_default() -> (tensor<8xf32> {tf_saved_model.index_path = ["output"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "output:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst_0 = "tf.Const"() {device = "", value = dense<1.0> : tensor<8xf32>} : () -> tensor<8xf32> loc(fused["Const:", "apple", "banana"]) + return %cst_0 : tensor<8xf32> + } +} + + +// ----- + +// Tests the case when there are functions called from the main function such as while_body/while_cond. + +module attributes {tf_saved_model.semantics} { + + func.func @serving_default(%arg0: tensor<1x5x5x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x5x5x1024xf32> {tf_saved_model.index_path = ["output"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = "tf.PartitionedCall"(%arg0) {f = @__inference_main} : (tensor<1x5x5x1024xf32>) -> tensor<1x5x5x1024xf32> + return %0 : tensor<1x5x5x1024xf32> + } + + func.func private @__inference_main(%arg0: tensor<1x5x5x1024xf32> {tf._user_specified_name = "input_tensor"}) -> tensor<1x5x5x1024xf32> + attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<1x5x5x1024>], tf._noinline = true, tf._original_func_name = "__inference_main_540"} { + %cst_0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<4> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<1.0> : tensor<1x5x5x1024xf32>} : () -> tensor<1x5x5x1024xf32> + // Check that these constants are unfrozen. 
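+    // Of these, only the 1x5x5x1024xf32 constant (1 * 5 * 5 * 1024 * 4 =
+    // 102,400 bytes) exceeds the 16-byte threshold; the scalar loop-bound
+    // constants are expected to stay as tf.Const.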
+ // CHECK: func private @__inference_main + // CHECK: %[[VAR_HANDLE_0:.*]] = "tf.VarHandleOp"() <{container = "", shared_name = "const_0"}> : () -> tensor>> + // CHECK: %[[READ_VAR_0:.*]] = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor<1x5x5x1024xf32> + %0:3 = "tf.While"(%cst_0, %cst_1, %arg0) {T = [i32, i32, f32], _lower_using_switch_merge = true, _num_original_outputs = 4 : i64, _read_only_resource_inputs = [], body = @while_body, cond = @while_cond, device = "", is_stateless = true, output_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<1x5x5x1024>], parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor<1x5x5x1024xf32>) -> (tensor, tensor, tensor<1x5x5x1024xf32>) + %1 = "tf.AddV2"(%0#2, %cst_2) {device = ""} : (tensor<1x5x5x1024xf32>, tensor<1x5x5x1024xf32>) -> tensor<1x5x5x1024xf32> + return %1 : tensor<1x5x5x1024xf32> + } + + func.func private @while_body(%arg0: tensor {tf._user_specified_name = "while/loop_counter"}, %arg1: tensor {tf._user_specified_name = "while/maximum_iterations"}, %arg2: tensor<1x5x5x1024xf32>) -> (tensor, tensor, tensor<1x5x5x1024xf32>) + attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<1x5x5x1024>], tf._original_func_name = "while_body_70"} { + %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<1.0> : tensor<1x5x5x1024xf32>} : () -> tensor<1x5x5x1024xf32> + // Check that these constants are remained in constants. + // CHECK: func private @while_body + // CHECK-DAG: %[[CST_0:.*]]= "tf.Const"() <{value = dense<1.000000e+00> : tensor<1x5x5x1024xf32>}> : () -> tensor<1x5x5x1024xf32> + %0 = "tf.AddV2"(%arg0, %cst) {device = ""} : (tensor, tensor) -> tensor + %1 = "tf.Identity"(%0) {device = ""} : (tensor) -> tensor + %2 = "tf.Identity"(%arg1) {device = ""} : (tensor) -> tensor + %4 = "tf.AddV2"(%arg2, %cst_0) {device = ""} : (tensor<1x5x5x1024xf32>, tensor<1x5x5x1024xf32>) -> tensor<1x5x5x1024xf32> + %5 = "tf.Identity"(%4) {device = ""} : (tensor<1x5x5x1024xf32>) -> tensor<1x5x5x1024xf32> + return %1, %2, %5 : tensor, tensor, tensor<1x5x5x1024xf32> + } + + func.func private @while_cond(%arg0: tensor {tf._user_specified_name = "while/loop_counter"}, %arg1: tensor {tf._user_specified_name = "while/maximum_iterations"}, %arg2: tensor<1x5x5x1024xf32>) -> tensor + attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<1x5x5x1024>], tf._original_func_name = "while_cond_60"} { + %cst = "tf.Const"() {value = dense<[0, 1, 2, 3]> : tensor<4xi32>} : () -> tensor<4xi32> + %cst_0 = "tf.Const"() {value = dense<5.0> : tensor} : () -> tensor + // Check that these constants are remained in constants. 
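+    // Both constants here are at or below the 16-byte threshold (4 * 4 = 16
+    // bytes and 4 bytes), so they are expected to stay as tf.Const regardless
+    // of being inside a while callee.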
+ // CHECK: func private @while_cond + // CHECK-DAG: %[[CST:.*]]= "tf.Const"() <{value = dense<[0, 1, 2, 3]> : tensor<4xi32>}> : () -> tensor<4xi32> + %0 = "tf.Sum"(%arg2, %cst) {device = "", keep_dims = false} : (tensor<1x5x5x1024xf32>, tensor<4xi32>) -> tensor + %1 = "tf.Less"(%0, %cst_0) {device = ""} : (tensor, tensor) -> tensor + %2 = "tf.Identity"(%1) {device = ""} : (tensor) -> tensor + return %2 : tensor + } +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_passes.cc b/tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_passes.cc new file mode 100644 index 000000000000..94be20872c35 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_passes.cc @@ -0,0 +1,216 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_passes.h" + +#include + +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "xla/mlir_hlo/mhlo/transforms/passes.h" + +namespace tensorflow { +namespace quantization { +namespace { + +void AddConvertTpuToCpuModelPasses(mlir::OpPassManager &pm) { + pm.addPass(mlir::tf_quant::CreateConvertTpuModelToCpuPass()); + pm.addPass(mlir::createInlinerPass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::tf_quant::CreateCastBf16OpsToF32Pass()); +} + +} // namespace + +void AddQuantizeQatPasses( + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, + std::optional mlir_dump_file_prefix) { + pm.addNestedPass( + mlir::tf_quant::CreateConvertFakeQuantToQdqPass()); + if (quantization_options.op_set() == OpSet::UNIFORM_QUANTIZED) { + pm.addNestedPass( + mlir::TF::CreateUnrollBatchMatMulPassPass()); + } + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + if (quantization_options.experimental_enable_tpu_model_support()) { + AddConvertTpuToCpuModelPasses(pm); + } + pm.addNestedPass( + mlir::tf_quant::CreateConvertTfXlaOpToTfOpPass()); + pm.addNestedPass( + mlir::tf_quant::CreatePrepareLiftingPass(quantization_options.op_set())); + + pm.addPass(mlir::tf_quant::CreateLiftQuantizableSpotsAsFunctionsPass( + quantization_options)); + pm.addPass(mlir::tf_quant::CreateInsertQuantizedFunctionsPass( + quantization_options.quantization_method().preset_method(), + quantization_options.op_set())); + // TODO: b/260677670 - Pass quantization options as pass's inputs where + // applicable + pm.addPass(mlir::tf_quant::CreateQuantizeCompositeFunctionsPass( + 
quantization_options.quantization_method().preset_method(), + quantization_options.op_set(), + quantization_options.enable_per_channel_quantization(), + quantization_options.min_num_elements_for_weights(), + quantization_options.enable_legacy_weight_only(), mlir_dump_file_prefix)); + pm.addPass(mlir::createSymbolDCEPass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + + // TODO: b/264637396 - Deprecate TF opset + if (quantization_options.op_set() != OpSet::TF) { + pm.addPass(mlir::createInlinerPass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + if (quantization_options.op_set() == OpSet::XLA) { + pm.addNestedPass( + mlir::tf_quant::CreateReplaceCastHacksWithTFXLAOpsPass()); + } + pm.addNestedPass(mlir::createCSEPass()); + } + pm.addNestedPass(mlir::tf_quant::CreateOptimizePass()); +} + +void AddQuantizePtqDynamicRangePasses( + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, + std::optional mlir_dump_file_prefix) { + pm.addNestedPass( + mlir::TF::CreateUnrollBatchMatMulPassPass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + if (quantization_options.experimental_enable_tpu_model_support()) { + AddConvertTpuToCpuModelPasses(pm); + } + pm.addNestedPass( + mlir::tf_quant::CreateConvertTfXlaOpToTfOpPass()); + pm.addNestedPass( + mlir::tf_quant::CreatePrepareLiftingPass(quantization_options.op_set())); + pm.addPass(mlir::tf_quant::CreateLiftQuantizableSpotsAsFunctionsDRQPass( + quantization_options.quantization_method().preset_method(), + quantization_options.op_set(), + quantization_options.min_num_elements_for_weights())); + pm.addPass(mlir::tf_quant::CreateInsertQuantizedFunctionsPass( + quantization_options.quantization_method().preset_method(), + quantization_options.op_set())); + pm.addPass(mlir::tf_quant::CreateQuantizeCompositeFunctionsPass( + quantization_options.quantization_method().preset_method(), + quantization_options.op_set(), + quantization_options.enable_per_channel_quantization(), + quantization_options.min_num_elements_for_weights(), + quantization_options.enable_legacy_weight_only(), mlir_dump_file_prefix)); + pm.addPass(mlir::createSymbolDCEPass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + + // TODO: b/264637396 - Deprecate TF opset + if (quantization_options.op_set() != OpSet::TF) { + pm.addPass(mlir::createInlinerPass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + if (quantization_options.op_set() == OpSet::XLA) { + pm.addNestedPass( + mlir::tf_quant::CreateReplaceCastHacksWithTFXLAOpsPass()); + } + pm.addNestedPass(mlir::createCSEPass()); + } + + pm.addNestedPass(mlir::tf_quant::CreateOptimizePass()); +} + +void AddQuantizePtqPreCalibrationPasses( + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options) { + if (quantization_options.op_set() == OpSet::UNIFORM_QUANTIZED) { + pm.addNestedPass( + mlir::TF::CreateUnrollBatchMatMulPassPass()); + } + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + if (quantization_options.experimental_enable_tpu_model_support()) { + AddConvertTpuToCpuModelPasses(pm); + } + pm.addNestedPass( + mlir::tf_quant::CreateConvertTfXlaOpToTfOpPass()); + pm.addNestedPass( + mlir::tf_quant::CreatePrepareLiftingPass(quantization_options.op_set())); + pm.addPass(mlir::tf_quant::CreateLiftQuantizableSpotsAsFunctionsPass( + quantization_options)); + // TODO: b/295140328 - Add debugger support for weight only + if 
(quantization_options.has_debugger_config()) { + pm.addPass(mlir::tf_quant::CreateAddDumpTensorOpPass( + quantization_options.debugger_config().debugger_type(), + quantization_options.debugger_config().log_dir_path())); + } + pm.addNestedPass( + mlir::tf_quant::CreateInsertCustomAggregationOpsPass( + quantization_options.calibration_options())); +} + +void AddQuantizePtqPostCalibrationPasses( + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, + std::optional mlir_dump_file_prefix) { + pm.addPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + pm.addNestedPass( + mlir::tf_quant::CreateConvertCustomAggregationOpToQuantStatsPass()); + pm.addPass(mlir::tf_quant::CreateInsertQuantizedFunctionsPass( + quantization_options.quantization_method().preset_method(), + quantization_options.op_set())); + pm.addPass(mlir::tf_quant::CreateQuantizeCompositeFunctionsPass( + quantization_options.quantization_method().preset_method(), + quantization_options.op_set(), + quantization_options.enable_per_channel_quantization(), + quantization_options.min_num_elements_for_weights(), + quantization_options.enable_legacy_weight_only(), mlir_dump_file_prefix)); + pm.addPass(mlir::createSymbolDCEPass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + + // TODO: b/264637396 - Deprecate TF opset + if (quantization_options.op_set() != OpSet::TF) { + pm.addPass(mlir::createInlinerPass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + if (quantization_options.op_set() == OpSet::XLA) { + pm.addNestedPass( + mlir::tf_quant::CreateReplaceCastHacksWithTFXLAOpsPass()); + } + pm.addNestedPass(mlir::createCSEPass()); + } + pm.addNestedPass(mlir::tf_quant::CreateOptimizePass()); +} + +void AddQuantizeWeightOnlyPasses( + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, + std::optional mlir_dump_file_prefix) { + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + // Add PrepareLiftingPass to utilize its functionalities like folding batch + // normalization ops and removing training related ops. + pm.addNestedPass( + mlir::tf_quant::CreatePrepareLiftingPass(quantization_options.op_set())); + pm.addPass(mlir::tf_quant::CreateQuantizeWeightsPass(quantization_options)); + pm.addPass(mlir::tf_quant::CreatePropagateQuantizeTypePass()); + pm.addPass(mlir::createSymbolDCEPass()); + pm.addPass(mlir::createInlinerPass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + pm.addNestedPass( + mlir::tf_quant::CreateReplaceCastHacksWithTFXLAOpsPass()); + pm.addNestedPass(mlir::createCSEPass()); + // Use optimize pass to remove double casts that are inserted when inlining + // functions. + pm.addNestedPass(mlir::tf_quant::CreateOptimizePass()); +} + +} // namespace quantization +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_passes.h b/tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_passes.h new file mode 100644 index 000000000000..5fabf3afcf07 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_passes.h @@ -0,0 +1,55 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_TF_QUANTIZE_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_TF_QUANTIZE_PASSES_H_ + +#include + +#include "absl/strings/string_view.h" +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace tensorflow { +namespace quantization { + +// mlir_dump_file_prefix is an optional field that is used for debugging to save +// mlir dump files. +void AddQuantizeQatPasses(mlir::OpPassManager &pm, + const QuantizationOptions &quantization_options, + std::optional + mlir_dump_file_prefix = std::nullopt); + +void AddQuantizePtqDynamicRangePasses( + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, + std::optional mlir_dump_file_prefix = + std::nullopt); + +void AddQuantizeWeightOnlyPasses( + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, + std::optional mlir_dump_file_prefix = + std::nullopt); + +void AddQuantizePtqPreCalibrationPasses( + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options); + +void AddQuantizePtqPostCalibrationPasses( + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, + std::optional mlir_dump_file_prefix = + std::nullopt); + +} // namespace quantization +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_TF_QUANTIZE_PASSES_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_preprocess.cc b/tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_preprocess.cc new file mode 100644 index 000000000000..bbb45556c449 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_preprocess.cc @@ -0,0 +1,233 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_preprocess.h" + +#include +#include +#include +#include +#include + +#include "mhlo/transforms/passes.h" +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/LogicalResult.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/tf_pass_pipeline.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_passes.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/fold_broadcast_pass.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/tf_fuse_convolution_pass.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/unfuse_batch_norm_pass.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/rename_entrypoint_to_main.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/tf_stablehlo_pass.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "xla/mlir_hlo/mhlo/transforms/passes.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace quantization { + +using ::mlir::tf_quant::stablehlo::AddXlaCallModuleOpDeserializationPasses; + +// Adds passes that unfuse MHLO ops that do not have their equivalents in +// StableHLO. +void AddUnfuseMhloOpsPasses(mlir::PassManager& pm) { + pm.addNestedPass( + mlir::mhlo::createLegalizeEinsumToDotGeneralPass()); + pm.addNestedPass( + mlir::mhlo::createLegalizeDotToDotGeneralPass()); + // Unfuse mhlo BatchNorm to primitive ops. + pm.addNestedPass(mlir::odml::createUnfuseBatchNormPass()); + // Fuse Conv + Mul to Conv. + pm.addNestedPass( + mlir::odml::tf_quant::createFuseConvolutionPass()); + // Fold broadcast_in_dim + Mul. + pm.addNestedPass(mlir::odml::createFoldBroadcastPass()); + pm.addNestedPass( + mlir::mhlo::createLegalizeTorchIndexSelectToGatherPass()); +} + +// Converts TF SavedModel to StableHLO module. The input TF SavedModel can have +// StableHLO module serialized into a XlaCallModuleOp. (ex: JAX/PyTorch models) +void AddTFToStablehloPasses( + mlir::PassManager& pm, + llvm::ArrayRef> input_arg_shapes) { + pm.addPass(mlir::odml::CreateRenameEntrypointToMainPass()); + // TODO: b/230572023 - Consider improving shape inference for While op instead + // of dropping the attribute. This need not be correct for models not trained + // on TPU. + // Extracts the StableHLO module from tf.XlaCallModuleOp if the StableHLO + // module is serialized in it. 
+ pm.addPass(mlir::stablehlo::CreateLegalizeTFXlaCallModuleToStablehloPass()); + + // Preprocesses TPU-targeting StableHLO module for support in TF Quantizer. + pm.addPass(mlir::tf_quant::CreateConvertTpuModelToCpuPass()); + pm.addPass(mlir::createInlinerPass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::tf_quant::CreateCastBf16OpsToF32Pass()); + + // Optimizes the graph via cleanups, merges, rewrites, constant folding, + // and edge case handling where possible. + pm.addNestedPass( + mlir::TF::CreateDropWhileShapeInvariantPass()); + pm.addNestedPass( + mlir::tf_executor::CreateTFExecutorGraphPruningPass()); + pm.addNestedPass( + mlir::tf_executor::CreateTFExecutorIslandCoarseningPass()); + pm.addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions()); + pm.addPass(mlir::createInlinerPass()); + pm.addPass(mlir::createSymbolDCEPass()); + pm.addPass(mlir::createCanonicalizerPass()); + // Propagates shapes on the TensorFlow graph. + pm.addPass(mlir::TF::CreateTFShapeInferencePass(input_arg_shapes)); + pm.addPass(mlir::createCanonicalizerPass()); + pm.addNestedPass( + mlir::TFDevice::CreateDecomposeResourceOpsPass()); + + // FreezeVariables only freezes variables for TF v1 types. Separately handle + // freezing of TF v2 GlobalTensor ops. (Ref: b/206855389) + pm.addPass(mlir::tf_saved_model::CreateOptimizeGlobalTensorsPass()); + pm.addPass(mlir::tf_saved_model::CreateFreezeGlobalTensorsPass( + /*allow_mutable_tensors=*/true)); + + // Generic MLIR optimization passes. + pm.addPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass(input_arg_shapes)); + + // Legalizes TF UniformQuantized types into MHLO. Part of the official + // TF/XLA bridge component. + pm.addNestedPass( + mlir::quant::stablehlo::CreateConvertTFQuantOpsToMHLOPass()); + pm.addPass(mlir::createCanonicalizerPass()); + + // TF -> StableHLO legalization. + // Skip StatefulPartitionedCall to preserve aliased functions. + mlir::odml::AddLegalizeTFToStablehloPasses(pm, /*skip_quantization_ops=*/true, + /*skip_resize=*/false, + /*skip_partitioned_calls=*/true); + // StableHLO -> MHLO legalization for MHLO optimization. + pm.addPass(mlir::mhlo::createStablehloLegalizeToHloPass()); + // Rewrites legacy StableHLO ops. + AddUnfuseMhloOpsPasses(pm); + pm.addNestedPass(mlir::createCanonicalizerPass()); + // MHLO -> StableHLO legalization. + pm.addPass(mlir::mhlo::createHloLegalizeToStablehloPass()); +} + +absl::Status PreprocessAndFreezeGraph( + const absl::string_view mlir_dump_file_prefix, const bool is_inliner_run, + const absl::flat_hash_set& noinline_functions, + mlir::ModuleOp module_op, mlir::MLIRContext* context, + std::optional session, const bool run_tf_to_stablehlo, + const bool deserialize_xla_call_module, + llvm::ArrayRef> input_arg_shapes) { + mlir::PassManager pm_before_freezing_variables(context); + mlir::StatusScopedDiagnosticHandler statusHandler(module_op.getContext(), + /*propagate=*/true); + + mlir::TF::StandardPipelineOptions standard_pipeline_options; + standard_pipeline_options.enable_inliner = false; + standard_pipeline_options.form_clusters = false; + mlir::TF::CreateTFStandardPipeline(pm_before_freezing_variables, + standard_pipeline_options); + + // The AddQuantizationUnitLocPass should be added before any other passes. 
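+  // (Presumably so that quantization unit locations are derived from the
+  // original op locations before later passes rewrite them.)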
+ pm_before_freezing_variables.addNestedPass( + mlir::tf_quant::CreateAddQuantizationUnitLocPass()); + pm_before_freezing_variables.addNestedPass( + mlir::TFDevice::CreateDecomposeResourceOpsPass()); + + mlir::PassManager pm_after_freezing_variables(context); + pm_after_freezing_variables.addPass(mlir::TF::CreateTFShapeInferencePass()); + pm_after_freezing_variables.addPass(mlir::createCanonicalizerPass()); + + // Makes certain functions immune to the `InlinerPass`. Used to preserve + // aliased functions. + pm_after_freezing_variables.addNestedPass( + mlir::tf_quant::CreateMarkFunctionsNoinlinePass(std::vector( + noinline_functions.begin(), noinline_functions.end()))); + if (is_inliner_run) { + pm_after_freezing_variables.addPass(mlir::createInlinerPass()); + } + if (run_tf_to_stablehlo) { + // AddLegalizeTFToStablehloPasses expects frozen TF variables when + // legalizing to stablehlo.constant. + AddTFToStablehloPasses(pm_after_freezing_variables, input_arg_shapes); + } + + if (deserialize_xla_call_module) { + // Deserialize the StableHLO module embedded in tf.XlaCallModule and lifts + // the StableHLO functions to the top level module. This is needed for + // StableHLO quantization. Also restores some shape information for + // XlaCallModuleOps and CustomAggregatorOps lost from the calibration step. + AddXlaCallModuleOpDeserializationPasses(pm_after_freezing_variables); + } + + if (const auto pre_variable_freezing_status = RunPassesOnModuleOp( + /*mlir_dump_file_name=*/absl::StrCat( + mlir_dump_file_prefix, "_preprocess_pre_variable_freezing"), + pm_before_freezing_variables, module_op); + !pre_variable_freezing_status.ok()) { + return pre_variable_freezing_status; + } + + if (!session.has_value() || !*session) { + mlir::PassManager pm_freezing_variables(context); + // This pass does resource analysis of saved model global tensors and marks + // those deemed read-only as immutable. + pm_freezing_variables.addPass( + mlir::tf_saved_model::CreateOptimizeGlobalTensorsPass()); + + pm_freezing_variables.addPass( + mlir::tf_saved_model::CreateFreezeGlobalTensorsPass( + /*allow_mutable_tensors=*/true)); + + pm_freezing_variables.addPass( + mlir::TFL::CreateUnfreezeMutableGlobalTensorsPass()); + + if (const auto variable_freezing_status = RunPassesOnModuleOp( + /*mlir_dump_file_name=*/absl::StrCat( + mlir_dump_file_prefix, "_preprocess_variable_freezing"), + pm_freezing_variables, module_op); + !variable_freezing_status.ok()) { + return variable_freezing_status; + } + } else if (failed( + mlir::tf_saved_model::FreezeVariables(module_op, *session))) { + return statusHandler.ConsumeStatus(); + } + + return RunPassesOnModuleOp( + /*mlir_dump_file_name=*/absl::StrCat( + mlir_dump_file_prefix, "_preprocess_post_variable_freezing"), + pm_after_freezing_variables, module_op); +} + +} // namespace quantization +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_preprocess.h b/tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_preprocess.h new file mode 100644 index 000000000000..b951557caca1 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tf_quantize_preprocess.h @@ -0,0 +1,86 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_TF_QUANTIZE_PREPROCESS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_TF_QUANTIZE_PREPROCESS_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace quantization { + +// Default MLIR dump file prefix for TensorFlow quantization passes. +inline constexpr absl::string_view kDefaultTfQuantMlirDumpFilePrefix = + "tf_quant"; + +// Preprocesses the `module_op` for quantization. The preprocess steps include +// freezing the variables in the graph into constants. `is_inliner_run` +// determines whether the `InlinerPass` should be run after unfreezing. +// +// `mlir_dump_file_prefix` is primarily used for debugging and does not affect +// the preprocessing behavior. Instructions for producing MLIR dump files are in +// the comments of `tensorflow::quantization::MaybeEnableIrPrinting` function. +absl::Status PreprocessAndFreezeGraph( + absl::string_view mlir_dump_file_prefix, bool is_inliner_run, + const absl::flat_hash_set& noinline_functions, + mlir::ModuleOp module_op, mlir::MLIRContext* context, + std::optional session, bool run_tf_to_stablehlo, + bool deserialize_xla_call_module, + llvm::ArrayRef> input_arg_shapes = {}); + +// Overload of `PreprocessAndFreezeGraph` that uses the default MLIR dump file +// prefix. +inline absl::Status PreprocessAndFreezeGraph(mlir::ModuleOp module_op, + mlir::MLIRContext* context, + std::optional session) { + return PreprocessAndFreezeGraph( + /*mlir_dump_file_prefix=*/kDefaultTfQuantMlirDumpFilePrefix, + /*is_inliner_run=*/true, /*noinline_functions=*/{}, module_op, context, + session, /*run_tf_to_stablehlo=*/false, + /*deserialize_xla_call_module=*/false, /*input_arg_shapes=*/{}); +} + +// Overload of `PreprocessAndFreezeGraph` that uses the default MLIR dump file +// prefix. +inline absl::Status PreprocessAndFreezeGraph(mlir::ModuleOp module_op, + mlir::MLIRContext* context) { + return PreprocessAndFreezeGraph( + /*mlir_dump_file_prefix=*/kDefaultTfQuantMlirDumpFilePrefix, + /*is_inliner_run=*/true, /*noinline_functions=*/{}, module_op, context, + nullptr, /*run_tf_to_stablehlo=*/false, + /*deserialize_xla_call_module=*/false, /*input_arg_shapes=*/{}); +} + +// TF->StableHLO has limited support for dynamic shapes. +// Some models can only be converted with explicitly provided input argument +// shapes. 
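+// A hypothetical usage sketch (the input shape below is made up for
+// illustration; `module_op` and `context` are assumed to exist):
+//   mlir::PassManager pm(&context);
+//   tensorflow::quantization::AddTFToStablehloPasses(
+//       pm, /*input_arg_shapes=*/{{1, 224, 224, 3}});
+//   if (mlir::failed(pm.run(module_op))) { /* handle the failure */ }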
+void AddTFToStablehloPasses( + mlir::PassManager& pm, + llvm::ArrayRef> input_arg_shapes = {}); + +} // namespace quantization +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_TF_QUANTIZE_PREPROCESS_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD index fcd42b88cc30..584444e8cb9e 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD @@ -23,6 +23,25 @@ cc_library( ], ) +cc_library( + name = "temp_fake_quant_utils", + srcs = ["temp_fake_quant_utils.cc"], + hdrs = [ + "temp_fake_quant_utils.h", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "tf_quantize_op_utils", srcs = ["tf_quantize_op_utils.cc"], @@ -34,6 +53,28 @@ cc_library( ], ) +cc_library( + name = "tf_tf_to_uniform_attribute_utils", + srcs = ["tf_tf_to_uniform_attribute_utils.cc"], + hdrs = ["tf_tf_to_uniform_attribute_utils.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/common:tf_attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common/tf_quantization_lib", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/ops:tf_uniform_op_quant_spec", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "tf_to_uniform_attribute_utils", srcs = ["tf_to_uniform_attribute_utils.cc"], @@ -73,6 +114,25 @@ tf_cc_test( ], ) +cc_library( + name = "tf_tf_to_xla_attribute_utils", + srcs = ["tf_tf_to_xla_attribute_utils.cc"], + hdrs = ["tf_tf_to_xla_attribute_utils.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/lite/core/c:tflite_common", + "//tensorflow/compiler/mlir/lite/kernels:padding", + "//tensorflow/compiler/mlir/quantization/common:tf_attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:tf_constant_fold", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings:str_format", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@local_xla//xla:xla_data_proto_cc", + ], +) + cc_library( name = "tf_to_xla_attribute_utils", srcs = ["tf_to_xla_attribute_utils.cc"], diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/temp_fake_quant_utils.cc b/tensorflow/compiler/mlir/quantization/tensorflow/utils/temp_fake_quant_utils.cc new file mode 100644 index 000000000000..bcde1612898a --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/temp_fake_quant_utils.cc @@ -0,0 +1,73 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Copied and modified from
+// //third_party/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.cc
+#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/temp_fake_quant_utils.h"
+
+#include "llvm/Support/Casting.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/Operation.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h"
+
+namespace mlir {
+namespace tf_quant {
+
+// Three instances of the rule to cover the three different types of
+// TF::FakeQuant operators.
+using PreparePerTensorFakeQuant = ConvertFakeQuantOpToQuantOps<
+    TF::FakeQuantWithMinMaxVarsOp, /*PerAxis=*/false,
+    FetchConstantMinMaxInputs<TF::FakeQuantWithMinMaxVarsOp>>;
+
+using PreparePerChannelFakeQuant = ConvertFakeQuantOpToQuantOps<
+    TF::FakeQuantWithMinMaxVarsPerChannelOp, /*PerAxis=*/true,
+    FetchConstantMinMaxInputs<TF::FakeQuantWithMinMaxVarsPerChannelOp>>;
+
+using PreparePerTensorFakeQuantWithMinMaxArgs = ConvertFakeQuantOpToQuantOps<
+    TF::FakeQuantWithMinMaxArgsOp, /*PerAxis=*/false,
+    FetchMinMaxAttrs<TF::FakeQuantWithMinMaxArgsOp>>;
+
+// Removes the wrapper of the tf.FakeQuant* ops and creates the quant.qcast
+// and quant.dcast pairs before the tf.FakeQuant* ops are folded.
+LogicalResult ConvertFakeQuantOps(func::FuncOp func, MLIRContext* ctx,
+                                  bool use_fake_quant_num_bits) {
+  OpBuilder builder(func);
+
+  // Insert the quant.qcast/quant.dcast ops in place of the tf.FakeQuant* ops
+  // to preserve the quantization parameters.
+  func.walk([&](Operation* op) {
+    if (auto fake_quant = llvm::dyn_cast<TF::FakeQuantWithMinMaxArgsOp>(op)) {
+      (void)PreparePerTensorFakeQuantWithMinMaxArgs(use_fake_quant_num_bits)
+          .matchAndRewrite(fake_quant, builder);
+    } else if (auto fake_quant =
+                   llvm::dyn_cast<TF::FakeQuantWithMinMaxVarsOp>(op)) {
+      (void)PreparePerTensorFakeQuant(use_fake_quant_num_bits)
+          .matchAndRewrite(fake_quant, builder);
+    } else if (auto fake_quant =
+                   llvm::dyn_cast<TF::FakeQuantWithMinMaxVarsPerChannelOp>(
+                       op)) {
+      (void)PreparePerChannelFakeQuant(use_fake_quant_num_bits)
+          .matchAndRewrite(fake_quant, builder);
+    }
+  });
+
+  return success();
+}
+
+}  // namespace tf_quant
+}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/temp_fake_quant_utils.h b/tensorflow/compiler/mlir/quantization/tensorflow/utils/temp_fake_quant_utils.h
new file mode 100644
index 000000000000..84119aa38b4a
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/temp_fake_quant_utils.h
@@ -0,0 +1,160 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This header file defines common utils used by TF-Quant transformation
+// passes to work with tf.FakeQuant* ops. Copied and modified from
+// //third_party/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TEMP_FAKE_QUANT_UTILS_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TEMP_FAKE_QUANT_UTILS_H_
+
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/common/ir/QuantOps.h"
+#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h"
+
+namespace mlir {
+namespace tf_quant {
+
+template <typename TFFakeQuantOp>
+struct FetchMinMaxAttrs {
+  using AttrType = FloatAttr;
+  bool operator()(TFFakeQuantOp tf_op, AttrType &min_value,
+                  AttrType &max_value) const {
+    min_value = tf_op.getMinAttr();
+    max_value = tf_op.getMaxAttr();
+    return true;  // Successfully matched and fetched.
+  }
+};
+
+template <typename TFFakeQuantOp>
+struct FetchConstantMinMaxInputs {
+  using AttrType = DenseFPElementsAttr;
+  bool operator()(TFFakeQuantOp tf_op, AttrType &min_value,
+                  AttrType &max_value) const {
+    Value min = tf_op.getMin(), max = tf_op.getMax();
+    if (auto min_id = min.getDefiningOp<TF::IdentityOp>()) {
+      min = min_id.getInput();
+    }
+    if (auto max_id = max.getDefiningOp<TF::IdentityOp>()) {
+      max = max_id.getInput();
+    }
+
+    if (!matchPattern(min, m_Constant(&min_value))) {
+      return false;
+    }
+    if (!matchPattern(max, m_Constant(&max_value))) {
+      return false;
+    }
+    return true;  // Successfully matched and fetched.
+  }
+};
+
+// Inserts a "quant.qcast" and "quant.dcast" op pair (QDQs) in place of the
+// tf.FakeQuantWithMinMax{Vars|VarsPerChannel|Args}Op before the op is
+// constant-folded. Since the constant folding logic will use an
+// "arith.constant" op to replace the "tf.FakeQuantWithMinMaxVarsOp", the
+// "quant.qcast" op is used to preserve the quantization parameters as a
+// TypeAttr and the "quant.dcast" op is used to convert the output type to
+// the next op.
Here are the transformations: +// +// input min cst max cst input +// \ | | | +// \ (tf.Identity) (tf.Identity) => quant.qcast +// \ | | | +// tf.FakeQuantWithMinMaxVars quant.dcast +// | | +// +// Warns if the (most likely unwanted, currently not quite correctly handled) +// case of back-to-back tf.FakeQuant occurs +// +// tf.FakeQuant* +// | +// tf.FakeQuant* +// +template +class ConvertFakeQuantOpToQuantOps { + public: + explicit ConvertFakeQuantOpToQuantOps(bool use_fake_quant_num_bits) + : use_fake_quant_num_bits_(use_fake_quant_num_bits) {} + + FetchMinMax fetch_min_max_; + + using FetchAttrType = typename FetchMinMax::AttrType; + LogicalResult matchAndRewrite(TFFakeQuantOp tf_op, + OpBuilder &rewriter) const { + if (tf_op.getNumBits() != 8) { + return failure(); + } + + // Extract the min/max constant values from the operands. We also consider + // a special case that there are tf.Identity ops between the min/max + // constants and the tf.FakeQuantWithMinMaxVarsOp. + FetchAttrType min_value, max_value; + if (!fetch_min_max_(tf_op, min_value, max_value)) { + return failure(); + } + + Value input = tf_op.getInputs(); + int quant_dim = -1; + auto input_type = mlir::cast(input.getType()); + if (PerAxis) { + if (!input_type.hasRank()) { + tf_op.emitError("The input should have known rank for per-channel op."); + return failure(); + } + // This is a special case that the quant_dim is the last dimensions. + quant_dim = input_type.getRank() - 1; + } + // Use the min/max from the operands and the num_bits and narrow_range + // attribute to create the quantization parameter for the new quantize op. + rewriter.setInsertionPointAfter(tf_op.getOperation()); + IntegerAttr num_bits = rewriter.getI64IntegerAttr(tf_op.getNumBits()); + BoolAttr narrow_range = rewriter.getBoolAttr(tf_op.getNarrowRange()); + Type res_type = tf_op.getType(); + TypeAttr qtype = tf_quant::GetQuantizedTypeAttr( + rewriter, input_type, min_value, max_value, quant_dim, num_bits, + narrow_range, /*is_signed=*/true, /*legacy_float_scale=*/false, + use_fake_quant_num_bits_); + if (!qtype) { + return failure(); + } + + // Finally, use the quantization parameter to create the quantize and + // dequantize ops, and insert them between the tf.FakeQuantWithMinMaxVarsOp + // and its users. + auto quantize = rewriter.create( + tf_op.getLoc(), qtype.getValue(), input); + auto dequantize = rewriter.create( + tf_op.getLoc(), res_type, quantize.getResult()); + tf_op.getOutputs().replaceAllUsesWith(dequantize); + + return success(); + } + + bool use_fake_quant_num_bits_; +}; + +// Removes the wrapper of the tf.FakeQuant* ops and creates the quant.qcast +// and quant.dcast pairs before tf.FakeQuant* ops are being folded. +LogicalResult ConvertFakeQuantOps(func::FuncOp func, MLIRContext *ctx, + bool use_fake_quant_num_bits); + +} // namespace tf_quant +} // namespace mlir +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TEMP_FAKE_QUANT_UTILS_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_uniform_attribute_utils.cc b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_uniform_attribute_utils.cc new file mode 100644 index 000000000000..2dda8bc4fd35 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_uniform_attribute_utils.cc @@ -0,0 +1,473 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_uniform_attribute_utils.h" + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_quantization_lib/tf_quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_uniform_op_quant_spec.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/core/util/quantization/uniform_quant_ops_attr.pb.h" + +namespace mlir::tf_quant { + +using QuantMethod = tensorflow::quantization::QuantizationMethod::PresetMethod; + +enum class OpType { + kDynamicRangeOp, // Dynamic Range kernels only have rhs attr. + kUnaryOp, // Unary ops have one min/max attr. + kBinaryOp, // Binary ops have lhs/rhs attr. + kQuantizationOp, // Quantization ops have input/output attr. +}; + +// For each op type, the following axis carries axis information: +// kDynamicRangeOp: rhs_quantization_axis will carry axis information. +// kUnaryOp: quantization_axis will carry axis information. +// kBinaryOp: Among {lhs, rhs, output}_quantization_axis, only check rhs. +// kQuantizationOp: Among {input, output}_quantization_axis, only check input. +// We therefore check exemplary 3 axes {rhs_, input_, }quantization_axis from +// previous accumulations. +constexpr std::array kQuantizationAxisAttrs = { + "input_quantization_axis", "quantization_axis", "rhs_quantization_axis"}; + +// Common suffixes for attributes used in FillQuantizationAttributes. 
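+// For example, the "rhs_quantization" prefix used below expands to the
+// "rhs_quantization_min_val" and "rhs_quantization_max_val" attributes in
+// FillQuantizationAttributes.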
+constexpr std::array kSuffixes = {"_min_val", "_max_val"}; + +Attribute GetWindowStridesValue( + PatternRewriter& rewriter, llvm::StringMap& identifier_to_attr) { + ArrayAttr stride = mlir::dyn_cast(identifier_to_attr["strides"]); + const int stride_h = mlir::cast(stride[1]).getInt(); + const int stride_w = mlir::cast(stride[2]).getInt(); + return rewriter.getI64ArrayAttr({stride_h, stride_w}); +} + +Attribute GetLhsDilationValue(PatternRewriter& rewriter, + llvm::StringMap& identifier_to_attr) { + return rewriter.getI64ArrayAttr({1, 1}); +} + +Attribute GetRhsDilationValue(PatternRewriter& rewriter, + llvm::StringMap& identifier_to_attr) { + ArrayAttr dilations = + mlir::dyn_cast(identifier_to_attr["dilations"]); + const int dilation_h = mlir::cast(dilations[1]).getInt(); + const int dilation_w = mlir::cast(dilations[2]).getInt(); + return rewriter.getI64ArrayAttr({dilation_h, dilation_w}); +} + +Attribute GetPaddingValue(PatternRewriter& rewriter, + llvm::StringMap& identifier_to_attr) { + llvm::StringRef padding = + mlir::dyn_cast(identifier_to_attr["padding"]).getValue(); + return rewriter.getStringAttr(padding); +} + +Attribute GetExplicitPaddingValue( + PatternRewriter& rewriter, llvm::StringMap& identifier_to_attr) { + ArrayAttr explicit_padding = + mlir::dyn_cast(identifier_to_attr["explicit_paddings"]); + return explicit_padding; +} + +Attribute GetDimensionNumbersValue( + PatternRewriter& rewriter, llvm::StringMap& identifier_to_attr) { + // Only NHWC is lifted in TF-quant and the corresponding dimension number is + // [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f]. + + tensorflow::UniformQuantizedConvolutionDimensionNumbersAttr dimension_numbers; + if (!tensorflow::protobuf::TextFormat::ParseFromString( + R"pb( + input_batch_dimension: 0 + input_feature_dimension: 3 + input_spatial_dimensions: 1 + input_spatial_dimensions: 2 + kernel_output_feature_dimension: 3 + kernel_input_feature_dimension: 2 + kernel_spatial_dimensions: 0 + kernel_spatial_dimensions: 1 + output_batch_dimension: 0 + output_feature_dimension: 3 + output_spatial_dimensions: 1 + output_spatial_dimensions: 2 + )pb", + &dimension_numbers)) { + return rewriter.getStringAttr(""); + } + return rewriter.getStringAttr(dimension_numbers.SerializeAsString()); +} + +Attribute GetBatchGroupCountValue( + PatternRewriter& rewriter, llvm::StringMap& identifier_to_attr) { + // Only 1 case is supported. + return rewriter.getI64IntegerAttr(1); +} + +Attribute GetQuantizationAxis(PatternRewriter& rewriter, Operation* op, + const int operand_index) { + auto* defining_op = op->getOperand(operand_index).getDefiningOp(); + for (auto attr : kQuantizationAxisAttrs) { + if (defining_op->hasAttr(attr)) { + return defining_op->getAttr(attr); + } + } + // Not found. 
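+  // -1 is the value used throughout this file to mean "no quantization
+  // axis", i.e. per-tensor rather than per-channel quantization.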
+ return rewriter.getI64IntegerAttr(-1); +} + +LogicalResult CheckIfAttrIs8Bit(const std::string& attr, Operation* op, + bool& is_8_bit) { + Type element_type; + if (attr == "lhs_quantization" || attr == "input_quantization" || + attr == "quantization") { + if (op->getNumOperands() < 1) { + return failure(); + } + element_type = getElementTypeOrSelf(op->getOperand(0).getType()); + } + if (attr == "rhs_quantization") { + if (op->getNumOperands() < 2) { + return failure(); + } + element_type = getElementTypeOrSelf(op->getOperand(1).getType()); + } + if (attr == "output_quantization") { + if (op->getNumResults() < 1) { + return failure(); + } + element_type = getElementTypeOrSelf(op->getOpResult(0).getType()); + } + if (element_type) { + is_8_bit = mlir::isa(element_type); + return success(); + } + return failure(); +} + +LogicalResult FillQuantizationAttributes( + PatternRewriter& rewriter, Operation* op, NamedAttrList& attrs, + llvm::StringMap& identifier_to_attr, OpType op_type) { + absl::flat_hash_map min_max_scheme_for_8bit = { + {"min", -128}, {"max", 127}}; + absl::flat_hash_map min_max_schema_for_32bit = { + {"min", -2147483648}, {"max", 2147483647}}; + + std::vector quantization_attributes; + switch (op_type) { + case OpType::kDynamicRangeOp: + quantization_attributes = {"rhs_quantization"}; + break; + case OpType::kUnaryOp: + quantization_attributes = {"quantization"}; + break; + case OpType::kBinaryOp: + quantization_attributes = {"lhs_quantization", "rhs_quantization", + "output_quantization"}; + break; + case OpType::kQuantizationOp: + quantization_attributes = {"input_quantization", "output_quantization"}; + break; + default: + quantization_attributes = {}; + break; + } + + for (const auto& attr : quantization_attributes) { + bool attr_is_8_bit; + if (failed(CheckIfAttrIs8Bit(attr, op, attr_is_8_bit))) { + return failure(); + } + for (int i = 0; i < kSuffixes.size(); i++) { + int64_t quant_val; + if (attr_is_8_bit) { + quant_val = i == 0 ? min_max_scheme_for_8bit["min"] + : min_max_scheme_for_8bit["max"]; + } else { + quant_val = i == 0 ? min_max_schema_for_32bit["min"] + : min_max_schema_for_32bit["max"]; + } + std::string attr_minmax = absl::StrCat(attr, kSuffixes[i]); + attrs.push_back(rewriter.getNamedAttr( + attr_minmax, rewriter.getI64IntegerAttr(quant_val))); + } + } + return success(); +} + +// This LogicalResult covers both the hybrid and fully quantized op cases. +LogicalResult FillAttributesForUniformQuantizedDotOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + QuantMethod quantization_method, bool enable_per_channel_quantization) { + NamedAttrList attrs; + + if (quantization_method == + tensorflow::quantization::QuantizationMethod::METHOD_DYNAMIC_RANGE_INT8) { + // Fill quantization related attributes for Hybrid op. + if (failed(FillQuantizationAttributes(rewriter, op, attrs, + identifier_to_attr, + OpType::kDynamicRangeOp))) { + return failure(); + } + } else { + // Fill quantization related attributes for fully quantized op. 
+ if (failed(FillQuantizationAttributes( + rewriter, op, attrs, identifier_to_attr, OpType::kBinaryOp))) { + return failure(); + } + // Per-channel activation is not supported + attrs.push_back(rewriter.getNamedAttr("lhs_quantization_axis", + rewriter.getI64IntegerAttr(-1))); + } + + std::unique_ptr spec = GetUniformOpQuantSpec(op); + absl::flat_hash_set operands = spec->quantizable_operands; + int quant_dim = -1; + if (enable_per_channel_quantization && operands.size() == 1) { + quant_dim = spec->coeff_op_quant_dim[*(operands.begin())]; + } + attrs.push_back(rewriter.getNamedAttr("rhs_quantization_axis", + rewriter.getI64IntegerAttr(quant_dim))); + attrs.push_back(rewriter.getNamedAttr("output_quantization_axis", + rewriter.getI64IntegerAttr(quant_dim))); + + op->setAttrs(rewriter.getDictionaryAttr(attrs)); + + return success(); +} + +// This LogicalResult covers both the hybrid and fully quantized op cases. +LogicalResult FillAttributesForUniformQuantizedConvolutionOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + QuantMethod quantization_method, bool enable_per_channel_quantization) { + NamedAttrList attrs; + absl::flat_hash_map&)> + attribute_getter_map; + + attribute_getter_map = {{"window_strides", GetWindowStridesValue}, + {"lhs_dilation", GetLhsDilationValue}, + {"rhs_dilation", GetRhsDilationValue}, + {"padding", GetPaddingValue}, + {"explicit_padding", GetExplicitPaddingValue}, + {"dimension_numbers", GetDimensionNumbersValue}, + {"batch_group_count", GetBatchGroupCountValue}}; + + for (auto& attr : op->getAttrs()) { + llvm::StringRef attr_name = attr.getName().getValue(); + if (attribute_getter_map.find(attr_name.str()) != + attribute_getter_map.end()) { + auto attr_val = + (attribute_getter_map[attr_name.str()])(rewriter, identifier_to_attr); + attrs.push_back(rewriter.getNamedAttr(attr_name, attr_val)); + } + } + + auto feature_group_cnt_attr = llvm::StringRef("feature_group_count"); + int feature_group_cnt = 1; + ShapedType input_shape = + mlir::dyn_cast(op->getOperand(0).getType()); + if (!input_shape) { + return op->emitError( + "Only input with known shape is supported for Uniform Quantized " + "opset."); + } + + if (op->getParentOfType().getName().contains("depthwise_")) { + feature_group_cnt = input_shape.getDimSize(3); + } + + attrs.push_back(rewriter.getNamedAttr( + feature_group_cnt_attr, rewriter.getI64IntegerAttr(feature_group_cnt))); + + if (quantization_method == + tensorflow::quantization::QuantizationMethod::METHOD_DYNAMIC_RANGE_INT8) { + // Fill quantization related attributes for Hybrid op. + if (failed(FillQuantizationAttributes(rewriter, op, attrs, + identifier_to_attr, + OpType::kDynamicRangeOp))) { + return failure(); + } + } else { + // Fill quantization related attributes for fully quantized op. 
+ if (failed(FillQuantizationAttributes( + rewriter, op, attrs, identifier_to_attr, OpType::kBinaryOp))) { + return failure(); + } + } + + if (quantization_method != + tensorflow::quantization::QuantizationMethod::METHOD_DYNAMIC_RANGE_INT8) { + // Per-channel activation is not supported + attrs.push_back(rewriter.getNamedAttr("lhs_quantization_axis", + rewriter.getI64IntegerAttr(-1))); + } + + std::unique_ptr spec = GetUniformOpQuantSpec(op); + absl::flat_hash_set operands = spec->quantizable_operands; + int quant_dim = -1; + if (enable_per_channel_quantization && operands.size() == 1) { + quant_dim = spec->coeff_op_quant_dim[*(operands.begin())]; + } + attrs.push_back(rewriter.getNamedAttr("rhs_quantization_axis", + rewriter.getI64IntegerAttr(quant_dim))); + attrs.push_back(rewriter.getNamedAttr("output_quantization_axis", + rewriter.getI64IntegerAttr(quant_dim))); + + op->setAttrs(rewriter.getDictionaryAttr(attrs)); + + return success(); +} + +LogicalResult FillAttributesForUniformQuantizedAddOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + const QuantMethod quantization_method, + const bool enable_per_channel_quantization) { + NamedAttrList attrs; + + // Fill quantization related attributes. + if (failed(FillQuantizationAttributes(rewriter, op, attrs, identifier_to_attr, + OpType::kBinaryOp))) { + return failure(); + } + Attribute activation_quantization_axis = rewriter.getI64IntegerAttr(-1); + if (enable_per_channel_quantization) { + // If either of lhs or rhs is per-channel quantized, the quantization axis + // must match for lhs, rhs, and output. + activation_quantization_axis = + GetQuantizationAxis(rewriter, op, /*operand_index=*/0); + if (activation_quantization_axis == rewriter.getI64IntegerAttr(-1)) { + activation_quantization_axis = + GetQuantizationAxis(rewriter, op, /*operand_index=*/1); + } + } + attrs.push_back(rewriter.getNamedAttr("lhs_quantization_axis", + activation_quantization_axis)); + attrs.push_back(rewriter.getNamedAttr("rhs_quantization_axis", + activation_quantization_axis)); + attrs.push_back(rewriter.getNamedAttr("output_quantization_axis", + activation_quantization_axis)); + op->setAttrs(rewriter.getDictionaryAttr(attrs)); + + return success(); +} + +LogicalResult FillAttributesForUniformQuantizedClipByValueOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + QuantMethod quantization_method, bool enable_per_channel_quantization) { + NamedAttrList attrs; + + // Fill quantization related attributes. + if (failed(FillQuantizationAttributes(rewriter, op, attrs, identifier_to_attr, + OpType::kUnaryOp))) { + return failure(); + } + + Attribute activation_quantization_axis = rewriter.getI64IntegerAttr(-1); + if (enable_per_channel_quantization) { + activation_quantization_axis = + GetQuantizationAxis(rewriter, op, /*operand_index=*/0); + } + attrs.push_back( + rewriter.getNamedAttr("quantization_axis", activation_quantization_axis)); + op->setAttrs(rewriter.getDictionaryAttr(attrs)); + + return success(); +} + +LogicalResult FillAttributesForUniformRequantizeOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + QuantMethod quantization_method, bool enable_per_channel_quantization) { + NamedAttrList attrs; + + // Fill quantization related attributes. 
+ if (failed(FillQuantizationAttributes(rewriter, op, attrs, identifier_to_attr, + OpType::kQuantizationOp))) { + return failure(); + } + + Attribute activation_quantization_axis = rewriter.getI64IntegerAttr(-1); + Attribute output_quantization_axis = rewriter.getI64IntegerAttr(-1); + // TODO(b/296916785): Revisit axis assignment logic. + if (enable_per_channel_quantization) { + activation_quantization_axis = + GetQuantizationAxis(rewriter, op, /*operand_index=*/0); + + auto output_scale_type = + mlir::dyn_cast(op->getOperand(3).getType()); + if (!output_scale_type) { + return failure(); + } + if (output_scale_type.hasRank() && 0 < output_scale_type.getRank()) { + output_quantization_axis = activation_quantization_axis; + } + } + // For per-axis -> per-axis requantization, input and output quantization + // axis must be equal. + attrs.push_back(rewriter.getNamedAttr("input_quantization_axis", + activation_quantization_axis)); + attrs.push_back(rewriter.getNamedAttr("output_quantization_axis", + output_quantization_axis)); + op->setAttrs(rewriter.getDictionaryAttr(attrs)); + + return success(); +} + +LogicalResult FillAttributesForUniformQuantizeOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + QuantMethod quantization_method, bool enable_per_channel_quantization) { + NamedAttrList attrs; + + // Fill quantization related attributes. + if (failed(FillQuantizationAttributes(rewriter, op, attrs, identifier_to_attr, + OpType::kUnaryOp))) { + return failure(); + } + Attribute quantization_axis = rewriter.getI64IntegerAttr(-1); + // TODO(b/296916785): Revisit axis assignment logic. + if (enable_per_channel_quantization) { + quantization_axis = rewriter.getI64IntegerAttr(3); + } + + attrs.push_back( + rewriter.getNamedAttr("quantization_axis", quantization_axis)); + op->setAttrs(rewriter.getDictionaryAttr(attrs)); + return success(); +} +} // namespace mlir::tf_quant diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_uniform_attribute_utils.h b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_uniform_attribute_utils.h new file mode 100644 index 000000000000..adb6b6e9b1ab --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_uniform_attribute_utils.h @@ -0,0 +1,72 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This header file defines common utils used when transforming TF ops to +// Uniform Quantized ops. 
+ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TF_TO_UNIFORM_ATTRIBUTE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TF_TO_UNIFORM_ATTRIBUTE_UTILS_H_ + +#include "llvm/ADT/StringMap.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" + +namespace mlir::tf_quant { + +LogicalResult FillAttributesForUniformQuantizedDotOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +LogicalResult FillAttributesForUniformQuantizedConvolutionOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +LogicalResult FillAttributesForUniformQuantizedAddOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +LogicalResult FillAttributesForUniformQuantizedClipByValueOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +LogicalResult FillAttributesForUniformRequantizeOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +LogicalResult FillAttributesForUniformQuantizeOp( + PatternRewriter& rewriter, Operation* op, + llvm::StringMap& identifier_to_attr, + tensorflow::quantization::QuantizationMethod::PresetMethod + quantization_method, + bool enable_per_channel_quantization); + +} // namespace mlir::tf_quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TF_TO_UNIFORM_ATTRIBUTE_UTILS_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_xla_attribute_utils.cc b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_xla_attribute_utils.cc new file mode 100644 index 000000000000..f52864190c38 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_xla_attribute_utils.cc @@ -0,0 +1,312 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/strings/str_format.h" +#include "llvm/ADT/ArrayRef.h" +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/core/c/builtin_op_data.h" +#include "tensorflow/compiler/mlir/lite/kernels/padding.h" +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/tf_constant_fold.h" +#include "xla/xla_data.pb.h" + +namespace mlir::tf_quant { +namespace { + +Value GetDimValue(OpBuilder &builder, Location loc, Value shape_value, + int32_t dim) { + Type attribute_type = builder.getI64Type(); + return builder.create( + loc, + RankedTensorType::get( + {}, mlir::cast(shape_value.getType()).getElementType()), + /*input=*/shape_value, + /*begin=*/Create1DConstValue(builder, loc, {dim}), + /*end=*/Create1DConstValue(builder, loc, {dim + 1}), + /*strides=*/Create1DConstValue(builder, loc, {1}), + /*begin_mask=*/builder.getIntegerAttr(attribute_type, 0), + /*end_mask=*/builder.getIntegerAttr(attribute_type, 0), + /*ellipsis_mask=*/builder.getIntegerAttr(attribute_type, 0), + /*new_axis_mask=*/builder.getIntegerAttr(attribute_type, 0), + /*shrink_axis_mask=*/builder.getIntegerAttr(attribute_type, 1)); +} + +// Given Value input_size, and known numbers filter_sz, dilation_rate, stride, +// calculate padding_low and padding_high for SAME padding. +void GetSamePaddingValues(OpBuilder &builder, Location loc, Value input_size, + int64_t filter_sz, int64_t dilation_rate, + int64_t stride, Value &padding_low, + Value &padding_high) { + Value zero = CreateScalarConstValue(builder, loc, 0); + Value one = CreateScalarConstValue(builder, loc, 1); + Value two = CreateScalarConstValue(builder, loc, 2); + Value filter_size = CreateScalarConstValue(builder, loc, filter_sz); + Type int32_scalar_type = zero.getType(); + + auto scalar_add = [&](Value lhs, Value rhs) { + return builder.create(loc, int32_scalar_type, lhs, rhs); + }; + auto scalar_mul = [&](Value lhs, Value rhs) { + return builder.create(loc, int32_scalar_type, lhs, rhs); + }; + auto scalar_sub = [&](Value lhs, Value rhs) { + return builder.create(loc, int32_scalar_type, lhs, rhs); + }; + auto scalar_div = [&](Value lhs, Value rhs) { + return builder.create(loc, int32_scalar_type, lhs, rhs); + }; + + // effective_filter_size = (filter_size - 1) * dilation_rate + 1 + Value stride_value = CreateScalarConstValue(builder, loc, stride); + Value dilation_rate_value = + CreateScalarConstValue(builder, loc, dilation_rate); + + Value effective_filter_size_op = scalar_add( + scalar_mul(dilation_rate_value, scalar_sub(filter_size, one)), one); + + // output_size = (input_size + stride - 1) / stride + Value output_size = scalar_div( + scalar_add(input_size, scalar_sub(stride_value, one)), stride_value); + // padding_needed = std::max( + // 0, + // (output_size - 1) * stride + effective_filter_size - input_size) + Value padding_needed = scalar_sub( + scalar_add(effective_filter_size_op, + scalar_mul(stride_value, scalar_sub(output_size, one))), + input_size); + padding_needed = builder.create(loc, padding_needed, zero); + padding_low = scalar_div(padding_needed, two); + padding_high = scalar_sub(padding_needed, padding_low); +} + +Value PadForDynamicShapedInputSamePadding( + OpBuilder &builder, Location loc, Value input, Value filter, + int8_t input_zp_value, ArrayAttr strides, 
ArrayAttr dilations, + StringAttr conv_padding, Value &padding, int num_dims) { + Value zero_rank1 = CreateConstValue(builder, loc, {1}, {0}); + SmallVector temp_padding_values{zero_rank1, zero_rank1}; + + auto reshape_op = [&](Value value, const SmallVector &shape) { + const int64_t rank = shape.size(); + return builder.create( + loc, RankedTensorType::get(shape, builder.getI32Type()), value, + CreateConstValue(builder, loc, {rank}, shape)); + }; + + ShapedType filter_shape = mlir::cast(filter.getType()); + Value input_shape_value = builder.create( + loc, RankedTensorType::get({num_dims}, builder.getI32Type()), input); + auto scalar_to_rank1 = [&](Value value) { return reshape_op(value, {1}); }; + for (int i : llvm::seq(1, num_dims - 1)) { + Value input_size_i = GetDimValue(builder, loc, input_shape_value, i); + const int stride_i = mlir::cast(strides[i]).getInt(); + const int dilation_i = mlir::cast(dilations[i]).getInt(); + const int filter_i = filter_shape.getDimSize(i - 1); + Value pad_i_low, pad_i_high; + GetSamePaddingValues(builder, loc, input_size_i, filter_i, dilation_i, + stride_i, pad_i_low, pad_i_high); + temp_padding_values.push_back(scalar_to_rank1(pad_i_low)); + temp_padding_values.push_back(scalar_to_rank1(pad_i_high)); + } + temp_padding_values.push_back(zero_rank1); + temp_padding_values.push_back(zero_rank1); + + padding = CreateConstValue( + builder, loc, /*shape=*/{num_dims - 2, 2}, + /*values=*/SmallVector(2 * (num_dims - 2), 0)); + Value zero = CreateScalarConstValue(builder, loc, 0); + Value temp_padding_rank1 = builder.create( + loc, RankedTensorType::get({2 * num_dims}, builder.getI32Type()), zero, + temp_padding_values); + Value temp_padding = reshape_op(temp_padding_rank1, {num_dims, 2}); + return builder.create( + loc, input.getType(), input, temp_padding, + CreateScalarConstValue(builder, loc, input_zp_value)); +} + +} // namespace + +// If input spatial sizes are dynamic (unknown) and padding is same, add ops to +// dynamically calculate padding size and add input_zp value Pad op with the +// padding. +// Otherwise, calculates padding with known numbers, and only for non-zero +// padding (input_zp != 0), adds Pad op before convolution. 
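+//
+// A small worked example of the SAME-padding arithmetic above (hypothetical
+// numbers, not taken from this change): for input_size=5, filter_size=3,
+// stride=2, dilation=1, the effective filter size is 3, out_size=ceil(5/2)=3,
+// padding_needed=max(0, (3-1)*2 + 3 - 5)=2, so padding_low=1, padding_high=1.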
+Value CalculatePaddingAndPadIfNeeded(OpBuilder &builder, Location loc, + Value input, Value filter, + int8_t input_zp_value, ArrayAttr strides, + ArrayAttr dilations, + StringAttr conv_padding, + ArrayAttr explicit_paddings, + Value &padding, int num_dims) { + ShapedType input_shape = mlir::cast(input.getType()); + SmallVector spatial_dims(num_dims - 2); + absl::c_iota(spatial_dims, 1); + bool has_dynamic_spatial_dim = absl::c_any_of( + spatial_dims, + [&input_shape](int64_t dim) { return input_shape.isDynamicDim(dim); }); + if (conv_padding.strref() == "SAME" && has_dynamic_spatial_dim) { + return PadForDynamicShapedInputSamePadding( + builder, loc, input, filter, input_zp_value, strides, dilations, + conv_padding, padding, num_dims); + } + + ShapedType filter_shape = mlir::cast(filter.getType()); + SmallVector padding_values(2 * num_dims, 0); + if (conv_padding.strref() == "EXPLICIT") { + if (explicit_paddings.size() != 2 * num_dims) { + emitError(loc, + absl::StrFormat( + "explicit_paddings are expected to be %d-element arrays", + 2 * num_dims)); + return {}; + } + for (int i : spatial_dims) { + padding_values[2 * i] = + mlir::cast(explicit_paddings[2 * i]).getInt(); + padding_values[2 * i + 1] = + mlir::cast(explicit_paddings[2 * i + 1]).getInt(); + } + } else if (conv_padding.strref() == "SAME") { + for (int i : spatial_dims) { + int input_size = input_shape.getDimSize(i); + int filter_size = filter_shape.getDimSize(i - 1); + int stride_i = mlir::cast(strides[i]).getInt(); + int dilation_i = mlir::cast(dilations[i]).getInt(); + + // LINT.IfChange + int out_size = tflite_migration::ComputeOutSize( + kTfLitePaddingSame, input_size, filter_size, stride_i, dilation_i); + + int offset = 0; + int padding_before = tflite_migration::ComputePaddingWithOffset( + stride_i, dilation_i, input_size, filter_size, out_size, &offset); + // LINT.ThenChange(//tensorflow/lite/kernels/padding.h) + + int padding_after = padding_before + offset; + padding_values[2 * i] = padding_before; + padding_values[2 * i + 1] = padding_after; + } + } + + if (input_zp_value == 0 || + absl::c_all_of(padding_values, [](int v) { return v == 0; })) { + padding = CreateConstValue( + builder, loc, {num_dims - 2, 2}, + SmallVector(padding_values.begin() + 2, + padding_values.end() - 2)); + return input; + } + padding = + CreateConstValue(builder, loc, {num_dims - 2, 2}, + SmallVector(2 * (num_dims - 2), 0)); + + Value temp_padding = + CreateConstValue(builder, loc, {num_dims, 2}, padding_values); + SmallVector output_shape(input_shape.getShape().begin(), + input_shape.getShape().end()); + for (int i : spatial_dims) { + output_shape[i] += padding_values[2 * i] + padding_values[2 * i + 1]; + } + + return builder.create( + loc, RankedTensorType::get(output_shape, builder.getI8Type()), input, + temp_padding, + CreateScalarConstValue(builder, loc, input_zp_value)); +} + +// Pack value using following formula: +// Consider value of rank=4, pack_dim=1 for example. 
+// +// if value.shape[1] % 2: +// value = pad(value, [0, 1, 0, 0]) +// +// slice_shape = value.shape +// slice_shape[1] /= 2 +// +// packed_low = slice(value, [0, 0, 0, 0], slice_shape) +// packed_low = bitwise_and(packed_low, 0x0F) +// +// packed_high = slice(value, [0, value.shape[1] / 2, 0, 0], slice_shape) +// packed_high = left_shift(packed_high, 4) +// +// packed_value = bitwise_or(packed_low, packed_high) +Value PackOperand(OpBuilder &builder, Location loc, Value value, int pack_dim) { + ShapedType value_type = mlir::cast(value.getType()); + const int rank = value_type.getRank(); + + SmallVector packed_shape(value_type.getShape().begin(), + value_type.getShape().end()); + RankedTensorType shape_type = + RankedTensorType::get({rank}, builder.getI64Type()); + Value shape_value = builder.create(loc, shape_type, value); + + // It is guaranteed that packed_shape[pack_dim] is known. + if (packed_shape[pack_dim] % 2 != 0) { + packed_shape[pack_dim] += 1; + SmallVector padding(rank * 2, 0); + padding[pack_dim * 2 + 1] = 1; + Value padding_value = + CreateConstValue(builder, loc, {rank, 2}, padding); + value = builder.create( + loc, RankedTensorType::get(packed_shape, builder.getI8Type()), value, + padding_value, CreateScalarConstValue(builder, loc, 0)); + + SmallVector shape_add(rank, 0); + shape_add[pack_dim] = 1; + shape_value = builder.create( + loc, shape_type, shape_value, + CreateConstValue(builder, loc, {rank}, shape_add)); + } + packed_shape[pack_dim] /= 2; + SmallVector divisor(rank, 1); + divisor[pack_dim] = 2; + + RankedTensorType packed_output_type = + RankedTensorType::get(packed_shape, builder.getI8Type()); + Value packed_shape_value = builder.create( + loc, shape_type, shape_value, + CreateConstValue(builder, loc, {rank}, divisor)); + + Value packed_low_begin_value = CreateConstValue( + builder, loc, {rank}, SmallVector(rank, 0)); + Value packed_low_value = + builder.create(loc, packed_output_type, value, + packed_low_begin_value, packed_shape_value); + packed_low_value = builder.create( + loc, packed_output_type, packed_low_value, + CreateScalarConstValue(builder, loc, 0x0F)); + + SmallVector packed_high_begin(rank, 0); + packed_high_begin[pack_dim] = packed_shape[pack_dim]; + Value packed_high_begin_value = + CreateConstValue(builder, loc, {rank}, packed_high_begin); + Value packed_high_value = + builder.create(loc, packed_output_type, value, + packed_high_begin_value, packed_shape_value); + packed_high_value = builder.create( + loc, packed_output_type, packed_high_value, + CreateScalarConstValue(builder, loc, 4)); + + Operation *packed = builder.create( + loc, packed_output_type, packed_low_value, packed_high_value); + return ConstantFoldOpIfPossible(packed).front(); +} + +} // namespace mlir::tf_quant diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_xla_attribute_utils.h b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_xla_attribute_utils.h new file mode 100644 index 000000000000..c2d6ed460f30 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_tf_to_xla_attribute_utils.h @@ -0,0 +1,43 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used when transforming TF ops to XLA +// ops. +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TF_TO_XLA_ATTRIBUTE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TF_TO_XLA_ATTRIBUTE_UTILS_H_ + +#include "mlir/IR/Builders.h" // from @llvm-project + +namespace mlir::tf_quant { + +// Caclulate padding values for XLA ops. +// Padding values for Uniform Quantized ops can be generated with this method as +// well as it shares the same definition for padding attribute with the XLA ops. +Value CalculatePaddingAndPadIfNeeded(OpBuilder &builder, Location loc, + Value input, Value filter, + int8_t input_zp_value, ArrayAttr strides, + ArrayAttr dilations, + StringAttr conv_padding, + ArrayAttr explicit_paddings, + Value &padding, int num_dims = 4); + +// Given value that is in 8bit type, but holds 4bit data in unpacked format, +// pack to nibble format along pack_dim. +// If the pack_dim size is odd, add 1-size 0 padding and then pack. +Value PackOperand(OpBuilder &builder, Location loc, Value value, int pack_dim); + +} // namespace mlir::tf_quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TF_TO_XLA_ATTRIBUTE_UTILS_H_ diff --git a/tensorflow/compiler/mlir/stablehlo/BUILD b/tensorflow/compiler/mlir/stablehlo/BUILD index d25c41e85585..be7299eaea8e 100644 --- a/tensorflow/compiler/mlir/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/stablehlo/BUILD @@ -1,6 +1,9 @@ +load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") load("@local_xla//xla/tsl:tsl.default.bzl", "tsl_pybind_extension") +load("@local_xla//xla/tsl/platform:build_config_root.bzl", "if_static") load("//tensorflow:pytype.default.bzl", "pytype_strict_library") load("//tensorflow:strict.default.bzl", "py_strict_test") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") package( @@ -15,6 +18,10 @@ package( package_group( name = "friends", packages = [ + "//platforms/darwinn/tools/visualization/graph_conversions/...", + "//tensorflow/compiler/mlir/lite/...", + "//tensorflow/compiler/mlir/quantization/...", + "//tensorflow/compiler/mlir/quantization/tensorflow/...", "//tensorflow/compiler/tests/...", ], ) @@ -34,11 +41,11 @@ tsl_pybind_extension( ], features = ["-use_header_modules"], deps = [ - "//third_party/python_runtime:headers", "@llvm-project//llvm:Support", "@llvm-project//mlir:CAPIIR", "@llvm-project//mlir:IR", "@llvm-project//mlir:MLIRBindingsPythonHeadersAndDeps", + "@local_xla//third_party/python_runtime:headers", "@nanobind", "@stablehlo//:stablehlo_capi", ], @@ -62,13 +69,33 @@ py_strict_test( ], ) +gentbl_cc_library( + name = "legalize_tf_patterns_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = [ + ( + ["-gen-rewriters"], + "transforms/generated_legalize_tf.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "transforms/legalize_tf_patterns.td", + deps = [ + 
"//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncTdFiles", + "@llvm-project//mlir:TensorOpsTdFiles", + "@local_xla//xla/mlir_hlo:hlo_ops_td_files", + ], +) + cc_library( name = "fold_broadcast_pass", srcs = [ "transforms/fold_broadcast_pass.cc", ], hdrs = [ - "transforms/stablehlo_passes.h", + "transforms/fold_broadcast_pass.h", ], compatible_with = get_compatible_with_portable(), copts = [ @@ -87,3 +114,251 @@ cc_library( ], alwayslink = 1, ) + +cc_library( + name = "legalize_utils", + srcs = ["transforms/utils.cc"], + hdrs = ["transforms/utils.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@local_xla//xla/mlir_hlo", + ], +) + +tf_cc_test( + name = "legalize_utils_test", + srcs = ["transforms/utils_test.cc"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":legalize_utils", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@local_xla//xla/mlir_hlo", + ], +) + +cc_library( + name = "legalize_tf", + srcs = [ + "transforms/generated_legalize_tf.inc", + "transforms/legalize_tf.cc", + ], + hdrs = [ + "transforms/legalize_tf_passes.h", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + ":legalize_tf_patterns_inc_gen", + ":legalize_utils", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", + "//tensorflow/compiler/mlir/tensorflow:xla_sharding_util", + "//tensorflow/core:framework", + "//tensorflow/core/kernels:conv_grad_shape_utils", + "@com_google_absl//absl/status", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:MemRefDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:TransformUtils", + "@local_tsl//tsl/platform:bfloat16", + "@local_tsl//tsl/platform:tensor_float_32_hdr_lib", + "@local_xla//xla:xla_data_proto_cc", + "@local_xla//xla/hlo/builder:padding", + "@local_xla//xla/hlo/builder:sharding_builder", + "@local_xla//xla/hlo/builder/lib:conv_grad_size_util", + "@local_xla//xla/hlo/translate/hlo_to_mhlo:attribute_importer", + "@local_xla//xla/mlir_hlo", + "@local_xla//xla/mlir_hlo:convert_op_folder", + "@local_xla//xla/tsl/platform:status", + "@stablehlo//:chlo_ops", + "@stablehlo//:stablehlo_ops", + ] + if_static(["@local_tsl//tsl/platform:tensor_float_32_utils"]), +) + +cc_library( + name = "tf_stablehlo", + srcs = [ + "transforms/tf_stablehlo_pass.cc", + ], + hdrs = [ + "transforms/tf_stablehlo_pass.h", + ], + compatible_with = get_compatible_with_portable(), + copts = [ + "-Ithird_party", + ], + deps = [ + ":legalize_tf", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow/transforms:lower_tf_lib", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf_with_tf2xla", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + "@local_xla//xla/mlir_hlo", + "@local_xla//xla/mlir_hlo:hlo_dialect_registration", + 
"@local_xla//xla/mlir_hlo:mhlo_passes", + "@local_xla//xla/mlir_hlo:type_conversion", + "@stablehlo//:chlo_ops", + "@stablehlo//:register", + ], + alwayslink = 1, +) + +# LINT.IfChange(legalize_tf_xla_call_module_to_stablehlo_pass) +cc_library( + name = "legalize_tf_xla_call_module_to_stablehlo_pass", + srcs = [ + "transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc", + ], + hdrs = [ + "transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h", + ], + compatible_with = get_compatible_with_portable(), + copts = [ + "-Ithird_party", + ], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + "@stablehlo//:stablehlo_ops", + "@stablehlo//:stablehlo_serialization", + "@stablehlo//:vhlo_ops", + ], + alwayslink = 1, +) +# LINT.ThenChange(//tensorflow/compiler/mlir/lite/stablehlo:legalize_tf_xla_call_module_to_stablehlo_pass) + +cc_library( + name = "fuse_convolution_pass", + srcs = [ + "transforms/mhlo_passes/fuse_convolution_pass.cc", + ], + hdrs = [ + "transforms/mhlo_passes/fuse_convolution_pass.h", + ], + compatible_with = get_compatible_with_portable(), + copts = [ + "-Ithird_party", + ], + deps = [ + "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", + "//tensorflow/compiler/mlir/utils:validators", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + "@local_xla//xla/mlir_hlo", + ], + alwayslink = 1, +) + +cc_library( + name = "tf_fuse_convolution_pass", + srcs = [ + "transforms/mhlo_passes/tf_fuse_convolution_pass.cc", + ], + hdrs = [ + "transforms/mhlo_passes/tf_fuse_convolution_pass.h", + ], + compatible_with = get_compatible_with_portable(), + copts = [ + "-Ithird_party", + ], + deps = [ + "//tensorflow/compiler/mlir/quantization/common:tf_attrs_and_constraints", + "//tensorflow/compiler/mlir/utils:validators", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + "@local_xla//xla/mlir_hlo", + ], + alwayslink = 1, +) + +cc_library( + name = "unfuse_batch_norm_pass", + srcs = [ + "transforms/mhlo_passes/unfuse_batch_norm_pass.cc", + ], + hdrs = [ + "transforms/mhlo_passes/unfuse_batch_norm_pass.h", + ], + compatible_with = get_compatible_with_portable(), + copts = [ + "-Ithird_party", + ], + deps = [ + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@local_xla//xla/mlir_hlo", + ], + alwayslink = 1, +) + +cc_library( + name = "rename_entrypoint_to_main", + srcs = [ 
+ "transforms/rename_entrypoint_to_main.cc", + ], + hdrs = [ + "transforms/rename_entrypoint_to_main.h", + ], + compatible_with = get_compatible_with_portable(), + copts = [ + "-Ithird_party", + ], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], + alwayslink = 1, +) diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/fold_broadcast_pass.cc b/tensorflow/compiler/mlir/stablehlo/transforms/fold_broadcast_pass.cc index 5023f3aadd18..ee39d8acd5d6 100644 --- a/tensorflow/compiler/mlir/stablehlo/transforms/fold_broadcast_pass.cc +++ b/tensorflow/compiler/mlir/stablehlo/transforms/fold_broadcast_pass.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/mlir/stablehlo/transforms/fold_broadcast_pass.h" #include #include @@ -35,7 +36,6 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project -#include "tensorflow/compiler/mlir/stablehlo/transforms/stablehlo_passes.h" #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/fold_broadcast_pass.h b/tensorflow/compiler/mlir/stablehlo/transforms/fold_broadcast_pass.h new file mode 100644 index 000000000000..bed6201e0e2b --- /dev/null +++ b/tensorflow/compiler/mlir/stablehlo/transforms/fold_broadcast_pass.h @@ -0,0 +1,32 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_FOLD_BROADCAST_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_FOLD_BROADCAST_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace odml { + +// Constant folds broadcast_in_dim op conditionally. +std::unique_ptr createFoldBroadcastPass(); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_FOLD_BROADCAST_PASS_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf.cc similarity index 99% rename from tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf.cc rename to tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf.cc index d1e7dd75dcfa..beca54296e3c 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf.cc @@ -15,18 +15,21 @@ limitations under the License. 
// This file implements logic for lowering TensorFlow dialect to XLA dialect. #include -#include +#include #include #include #include +#include #include #include #include #include #include +#include #include #include +#include "absl/status/status.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" @@ -54,14 +57,14 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "stablehlo/dialect/ChloOps.h" // from @stablehlo -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_passes.h" -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/utils.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_passes.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h" -#include "xla/client/lib/conv_grad_size_util.h" -#include "xla/client/padding.h" -#include "xla/client/sharding_builder.h" +#include "xla/hlo/builder/lib/conv_grad_size_util.h" +#include "xla/hlo/builder/padding.h" +#include "xla/hlo/builder/sharding_builder.h" #include "xla/hlo/translate/hlo_to_mhlo/attribute_importer.h" #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" #include "xla/mlir_hlo/utils/convert_op_folder.h" @@ -6842,7 +6845,7 @@ class LowerControlFlowOp : public OpConversionPattern { // Keep all these in the odml namespace to avoid collisions with the tf2xla // version for now. -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/generated_legalize_tf.inc" +#include "tensorflow/compiler/mlir/stablehlo/transforms/generated_legalize_tf.inc" void PopulatePatterns(MLIRContext *context, RewritePatternSet *patterns) { populateWithGenerated(*patterns); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_passes.h b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_passes.h similarity index 85% rename from tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_passes.h rename to tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_passes.h index 9594769e93f7..a81cc57b4d2f 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_passes.h +++ b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_passes.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_TF_PASSES_H_ -#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_TF_PASSES_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_LEGALIZE_TF_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_LEGALIZE_TF_PASSES_H_ #include #include @@ -48,4 +48,4 @@ void PopulateLegalizeTfPatterns(MLIRContext* context, } // namespace odml } // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_TF_PASSES_H_ +#endif // TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_LEGALIZE_TF_PASSES_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_patterns.td similarity index 93% rename from tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_patterns.td rename to tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_patterns.td index dbe7457d9ee5..24b1d05bce97 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_patterns.td @@ -33,8 +33,8 @@ def IEEEFloatTensor : TensorOf<[F16, F32, F64]>; // BatchNorm op patterns. //===----------------------------------------------------------------------===// -def FalseBoolAttr : AttrConstraint().getValue()">>; -def TrueBoolAttr : AttrConstraint().getValue()">>; +def FalseBoolAttr : AttrConstraint($_self).getValue()">>; +def TrueBoolAttr : AttrConstraint($_self).getValue()">>; def CastValueToI64: NativeCodeCall< "CastValueToI64($0.getLoc(), $1, &$_builder)">; @@ -47,21 +47,24 @@ def CastValueToElementType: NativeCodeCall< // the corresponding value of ranked tensor type whose axis is referred in $0. def GetHLOAxisFromTFAxis : NativeCodeCall< "GetHLOAxisFromTFAxis(" - "$0, $1.getType().cast().getRank(), &$_builder)">; + "$0, llvm::cast($1.getType()).getRank(), &$_builder)">; // Same as the above but with $1 of type operand_range from variadic TensorFlow // input. def GetHLOAxisFromTFAxisVariadic : NativeCodeCall< "GetHLOAxisFromTFAxis(" - "$0, (*$1.begin()).getType().cast().getRank(), " + "$0, llvm::cast((*$1.begin()).getType()).getRank(), " "&$_builder)">; def CastElementsToI64Elements : NativeCodeCall< - "hlo::convertElementsAttr(" - "$0.cast(), $_builder.getIntegerType(64)).cast()">; + "llvm::cast(hlo::convertElementsAttr(" + "llvm::cast($0), $_builder.getIntegerType(64)))">; def EmptyDotAlgorithmAttr : NativeCodeCall<"mlir::mhlo::DotAlgorithmAttr{}">; +def ConstDefaultResultAccuracyAttr : + ConstantAttr; + //===----------------------------------------------------------------------===// // ApproximateEqual op pattern. //===----------------------------------------------------------------------===// @@ -271,17 +274,17 @@ def : EqualityPat>; //===----------------------------------------------------------------------===// def OneElementAttrPred - : CPred<"$_self.cast().getShapedType().getNumElements() == 1">; + : CPred<"llvm::cast($_self).getShapedType().getNumElements() == 1">; def OneElementAttr : ElementsAttrBase, "Scalar ElementsAttr">; def HasRankedFirstOperand - : Constraint()">>; + : Constraint((*$0.begin()).getType())">>; def IsShapedTensor - : Constraint()">>; + : Constraint($0.getType())">>; // This pattern converts TensorFlow axis format to HLO axis format which // doesn't wrap around like TensorFlow and is always positive. 
For this @@ -329,10 +332,10 @@ class MHLO_FftTypeValue : ConstantAttr; def GetInnerDimFromValue : NativeCodeCall< - "GetInnerDimFromValue($0.getType().cast(), &$_builder)">; + "GetInnerDimFromValue(llvm::cast($0.getType()), &$_builder)">; def CheckInnerDimStatic - : Constraint(), &$_builder)">>; + : Constraint($0.getType()), &$_builder)">>; def : Pat<(TF_FFTOp:$res $input), (MHLO_FftOp $input, MHLO_FftTypeValue<"FFT">, (GetInnerDimFromValue $res)), @@ -361,14 +364,14 @@ def LegalizeGatherV2 : //===----------------------------------------------------------------------===// class SliceDenseIntElementsAttrColumn2D : NativeCodeCall< - "SliceDenseIntElementsAttrColumn2D($0.cast(), " # column # " )">; + "SliceDenseIntElementsAttrColumn2D(llvm::cast($0), " # column # " )">; class SliceDenseIntElementsAttr : NativeCodeCall< - "SliceDenseIntElementsAttr($0.cast(), " # index # ", " # axis # ")">; + "SliceDenseIntElementsAttr(llvm::cast($0), " # index # ", " # axis # ")">; // Interior padding attribute based on the TF padding. def GetInteriorPadding : NativeCodeCall < - "GetInteriorPadding($0.cast())">; + "GetInteriorPadding(llvm::cast($0))">; def : Pat<(TF_PadV2Op $input, (ConstantLikeMatcher ElementsAttr:$padding), $c), (MHLO_PadOp $input, $c, @@ -404,6 +407,9 @@ def : Pat<(TF_MatMulOp $a, $b, $transpose_a, $transpose_b, $grad_a, $grad_b), // Lower `tf.ZerosLike` //===----------------------------------------------------------------------===// +class MHLO_ConstantLike : NativeCodeCall< + "chlo::getConstantLike($_builder, $_loc, " # value # ", $0)">; + def : Pat<(TF_ZerosLikeOp AnyTensor:$arg), (MHLO_ConstantLike<"0"> $arg)>; @@ -425,7 +431,7 @@ def : Pat<(TF_EluOp AnyTensor:$features), (MHLO_ConstantLike<"0">:$zero $features), MHLO_ComparisonDirectionValue<"GT">, (MHLO_DEFAULT_COMPARISON_TYPE)), $features, - (MHLO_Expm1Op $features))>; + (MHLO_Expm1Op $features, ConstDefaultResultAccuracyAttr))>; def : Pat<(TF_EluGradOp AnyStaticShapeTensor:$gradients, AnyRankedTensor:$features), (MHLO_SelectOp @@ -508,10 +514,10 @@ def UnpackStartingIndices: NativeCodeCall< "UnpackTensorAlongZeroDim($0.getLoc(), $1, &$_builder).getOutput()">; def CanBeTranslatedToDynamicSlice : Constraint())">>; + "CanBeTranslatedToDynamicSlice($0, $1, llvm::cast($2))">>; def TFSliceSizes2HLOSliceSizes : NativeCodeCall< - "TFSliceSizes2HLOSliceSizes($0, $1, $2.cast()," + "TFSliceSizes2HLOSliceSizes($0, $1, llvm::cast($2)," "&$_builder)">; def : Pat<(TF_SliceOp:$op MHLO_Tensor:$input, MHLO_Tensor:$starting_indices, @@ -557,7 +563,7 @@ def : Pat<(TF_LegacyCallOp:$op $args, $args_attrs, $res_attrs, FlatSymbolRefAttr //===----------------------------------------------------------------------===// // Handles axis conversion for TF reverse. 
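Both GetHLOAxisFromTFAxis above and ConvertAxisAttr below rest on the same idea: TensorFlow accepts negative axes that count back from the last dimension, while HLO expects a non-negative axis. A minimal sketch of that normalization follows; the helper name is hypothetical and the real helpers additionally go through the rewriter and attribute machinery.

// Hypothetical sketch of TF-to-HLO axis normalization (not the patch's implementation).
static int64_t NormalizeTfAxisForHlo(int64_t axis, int64_t rank) {
  // A negative TF axis counts backwards from the last dimension.
  return axis < 0 ? axis + rank : axis;
}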
-def ConvertAxisAttr : NativeCodeCall<"ConvertAxisAttr($0, $1.cast(), &$_builder)">; +def ConvertAxisAttr : NativeCodeCall<"ConvertAxisAttr($0, llvm::cast($1), &$_builder)">; def : Pat<(TF_ReverseV2Op AnyRankedTensor:$values, (ConstantLikeMatcher ElementsAttr:$axis)), (MHLO_ReverseOp $values, (ConvertAxisAttr $values, $axis))>; @@ -570,33 +576,32 @@ foreach Mapping = [ [TF_AbsOp, MHLO_AbsOp], [TF_CeilOp, MHLO_CeilOp], [TF_ComplexAbsOp, MHLO_AbsOp], - [TF_CosOp, MHLO_CosineOp], - [TF_Expm1Op, MHLO_Expm1Op], [TF_ErfOp, MHLO_ErfOp], [TF_FloorOp, MHLO_FloorOp], [TF_ImagOp, MHLO_ImagOp], [TF_InvertOp, MHLO_NotOp], [TF_IsFiniteOp, MHLO_IsFiniteOp], - [TF_LogOp, MHLO_LogOp], - [TF_Log1pOp, MHLO_Log1pOp], [TF_LogicalNotOp, MHLO_NotOp], [TF_NegOp, MHLO_NegOp], [TF_RealOp, MHLO_RealOp], - [TF_RsqrtOp, MHLO_RsqrtOp], - [TF_SigmoidOp, MHLO_LogisticOp], - [TF_SinOp, MHLO_SineOp], - [TF_SqrtOp, MHLO_SqrtOp], - [TF_TanhOp, MHLO_TanhOp], - [TF_TanOp, MHLO_TanOp] ] in { def : Pat<(Mapping[0] MHLO_Tensor:$input), (Mapping[1] $input)>; } -def ConstDefaultResultAccuracyAttr : - ConstantAttr; - -foreach Mapping = [[TF_ExpOp, MHLO_ExpOp]] in { +foreach Mapping = [ + [TF_CosOp, MHLO_CosineOp], + [TF_ExpOp, MHLO_ExpOp], + [TF_Expm1Op, MHLO_Expm1Op], + [TF_LogOp, MHLO_LogOp], + [TF_Log1pOp, MHLO_Log1pOp], + [TF_RsqrtOp, MHLO_RsqrtOp], + [TF_SigmoidOp, MHLO_LogisticOp], + [TF_SinOp, MHLO_SineOp], + [TF_SqrtOp, MHLO_SqrtOp], + [TF_TanhOp, MHLO_TanhOp], + [TF_TanOp, MHLO_TanOp], + ] in { def : Pat<(Mapping[0] MHLO_Tensor:$input), (Mapping[1] $input, ConstDefaultResultAccuracyAttr)>; } @@ -703,7 +708,7 @@ def : Pattern<(TF_SoftplusOp AnyTensor:$features), [ (MHLO_ExpOp:$features_exp $features, ConstDefaultResultAccuracyAttr), (CHLO_BroadcastAddOp:$threshold - (MHLO_LogOp (MHLO_ConstantOp (EpsilonValue $features))), + (MHLO_LogOp (MHLO_ConstantOp (EpsilonValue $features)), ConstDefaultResultAccuracyAttr), (MHLO_ConstantOp (GetScalarOfType<2> $features)), (NullDenseI64ArrayAttr) ), @@ -725,7 +730,7 @@ def : Pattern<(TF_SoftplusOp AnyTensor:$features), (CHLO_DEFAULT_COMPARISON_TYPE) ), $features_exp, - (MHLO_Log1pOp $features_exp) + (MHLO_Log1pOp $features_exp, ConstDefaultResultAccuracyAttr) ) ), (replaceWithValue $output) diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc new file mode 100644 index 000000000000..773496af73ff --- /dev/null +++ b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc @@ -0,0 +1,266 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +// LINT.IfChange +#include "tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h" + +#include +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/IR/Quant.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/Serialization.h" // from @stablehlo +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "stablehlo/dialect/VhloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace stablehlo { + +static constexpr absl::string_view kStablehloModuleDefaultEntryFuncName = + "main"; +static constexpr absl::string_view kStablehloFuncNamePrefix = "XlaCallModule"; +static constexpr char kShardingAttr[] = "mhlo.sharding"; +static constexpr char kShardingName[] = "Sharding"; + +class RemoveCustomCallWithSharding + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(stablehlo::CustomCallOp op, + PatternRewriter &rewriter) const override { + // Removes the custom call with sharding op if the operand type is the + // same as the result type. + if (op->hasAttr(kShardingAttr) && op.getCallTargetName() == kShardingName && + op.getNumOperands() == 1 && op.getNumResults() == 1 && + op.getOperands().front().getType() == + op.getResults().front().getType()) { + rewriter.replaceOp(op, op.getOperands()); + return success(); + } + return failure(); + } +}; + +namespace { + +bool IsShloMainFuncOp(func::FuncOp func_op) { + if (func_op == nullptr) { + return false; + } + + if (!func_op.getSymName().contains(kStablehloModuleDefaultEntryFuncName)) { + return false; + } + + if (func_op.getSymVisibility() == "nested" || + func_op.getSymVisibility() == "private") { + return false; + } + + return true; +} + +// Returns true if XlaCallModuleOp has the "platform index argument". The +// platform index argument is an extra 0-dimensional i32 tensor argument at +// index 0 when the XlaCallModuleOp contains more than one platform specified at +// the "platform" attribute. 
+// +// See: +// https://github.com/tensorflow/tensorflow/blob/eba24f41ba9d661d2f58a515921720cf90708cd4/tensorflow/compiler/tf2xla/ops/xla_ops.cc#L1376-L1385 +bool ContainsPlatformIndexArg(TF::XlaCallModuleOp xla_call_module_op) { + return xla_call_module_op.getPlatforms().size() > 1; +} + +} // namespace + +class ConvertTFXlaCallModuleOp : public OpRewritePattern { + public: + explicit ConvertTFXlaCallModuleOp(MLIRContext *context, ModuleOp module_op) + : OpRewritePattern(context), module_op_(module_op) {} + using OpRewritePattern::OpRewritePattern; + + private: + ModuleOp module_op_; + LogicalResult matchAndRewrite(TF::XlaCallModuleOp op, + PatternRewriter &rewriter) const override { + OwningOpRef stablehlo_module_op = + stablehlo::deserializePortableArtifact(op.getModuleAttr(), + getContext()); + if (stablehlo_module_op.get() == nullptr) { + return failure(); + } + SymbolTable parent_module_symbol_table(module_op_); + SymbolTable stablehlo_module_symbol_table(stablehlo_module_op.get()); + { + auto main_func_op = stablehlo_module_symbol_table.lookup( + kStablehloModuleDefaultEntryFuncName); + // TODO(b/291988976): move enforcement of this variable outside of this + // rewrite pattern such that it's only checked once. Currently, this + // approach results in duplicate error messages as this pattern executes + // more than once. + if (!IsShloMainFuncOp(main_func_op)) { + auto error_msg = + "'main' FuncOp in XlaCallModuleOp missing or has visibility other " + "than 'public'"; + if (main_func_op) { + main_func_op->emitError(error_msg); + } + return rewriter.notifyMatchFailure(op, error_msg); + } + } + Builder stablehlo_builder(stablehlo_module_op.get().getContext()); + // Rename XlaCallModuleOp's functions to avoid naming conflicts. + for (auto func_op : stablehlo_module_op.get().getOps()) { + const std::string new_func_name = + CreateNewFuncName(func_op.getSymName(), parent_module_symbol_table); + if (failed(stablehlo_module_symbol_table.replaceAllSymbolUses( + func_op, stablehlo_builder.getStringAttr(new_func_name), + stablehlo_module_op.get()))) { + return failure(); + } + SymbolTable::setSymbolName(func_op, new_func_name); + } + // Move all functions from XlaCallModuleOp's stablehlo module, to parent + // module. Also marks the stablehlo module entry function as private. + func::FuncOp main_fn; + for (auto func_op : stablehlo_module_op.get().getOps()) { + func::FuncOp cloned_func_op = func_op.clone(); + if (IsShloMainFuncOp(cloned_func_op)) { + main_fn = cloned_func_op; + } + cloned_func_op.setSymVisibility( + stablehlo_builder.getStringAttr("private")); + parent_module_symbol_table.insert(cloned_func_op); + } + + // When the `XlaCallModuleOp`'s callee accepts a platform index argument, + // add a dummy platform index argument in order to match the number of + // the arguments of the callee function. + // + // This is because `XlaCallModuleOp` doesn't explicitly take it as an + // operand. See: + // https://github.com/tensorflow/tensorflow/blob/eba24f41ba9d661d2f58a515921720cf90708cd4/tensorflow/compiler/tf2xla/ops/xla_ops.cc#L1376-L1385 + + SmallVector call_op_operands(op.getOperands()); + if (ContainsPlatformIndexArg(op)) { + Value dummy_const = rewriter.create( + op.getLoc(), + DenseIntElementsAttr::get( + RankedTensorType::get({}, rewriter.getIntegerType(32)), {0})); + call_op_operands.insert(call_op_operands.begin(), dummy_const); + } + + // The stablehlo module main function's input tensor types might be + // different from the XlaCallModuleOp's input tensor types. 
For example, + // The XlaCallModuleOp's input is tensor<*xf32> while the function's + // argument type is tensor<1x2f32>. + SmallVector casted_operands; + casted_operands.reserve(main_fn.getNumArguments()); + assert(call_op_operands.size() == main_fn.getNumArguments()); + for (const auto &operand_and_type : + zip(call_op_operands, main_fn.getFunctionType().getInputs())) { + Value operand = std::get<0>(operand_and_type); + Type expected_type = std::get<1>(operand_and_type); + if (operand.getType() != expected_type) { + operand = rewriter.create( + op.getLoc(), expected_type, operand, + /*Truncate=*/rewriter.getBoolAttr(false)); + } + casted_operands.push_back(operand); + } + + auto call = rewriter.create( + op->getLoc(), main_fn.getSymName(), main_fn.getResultTypes(), + casted_operands); + rewriter.replaceOp(op, call->getResults()); + + return success(); + } + + // Creates a new function name to avoid collision. The naming scheme is + // XlaCallModule_%s_%d where %s is the original function name and %d is the + // counter. + std::string CreateNewFuncName(const StringRef func_name, + SymbolTable &symbol_table) const { + int suffix_id = 0; + std::string new_func_name = absl::StrCat(kStablehloFuncNamePrefix, "_", + func_name.str(), "_", suffix_id); + while (symbol_table.lookup(new_func_name)) { + suffix_id++; + new_func_name = absl::StrCat(kStablehloFuncNamePrefix, "_", + func_name.str(), "_", suffix_id); + } + return new_func_name; + } +}; + +class TFXlaCallModuleOpToStablehloPass + : public PassWrapper> { + public: + StringRef getArgument() const final { + return "tf-xla-callmodule-op-to-stablehlo-pass"; + } + StringRef getDescription() const final { + return "Legalize TF_XlaCallModule Op to stablehlo"; + } + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + ModuleOp module_op = getOperation(); + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext(), module_op); + patterns.add(&getContext()); + if (failed(applyPatternsGreedily(module_op, std::move(patterns)))) { + return signalPassFailure(); + } + } +}; + +std::unique_ptr> +CreateLegalizeTFXlaCallModuleToStablehloPass() { + return std::make_unique(); +} + +static PassRegistration pass; + +} // namespace stablehlo +} // namespace mlir +// LINT.ThenChange(//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc) diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h new file mode 100644 index 000000000000..55a2d9cd82de --- /dev/null +++ b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h @@ -0,0 +1,36 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +// LINT.IfChange +#ifndef TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_LEGALIZE_TF_XLA_CALL_MODULE_TO_STABLEHLO_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_LEGALIZE_TF_XLA_CALL_MODULE_TO_STABLEHLO_PASS_H_ + +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace stablehlo { + +// Adds passes which transform TF_XlaCallModule Op to StableHLO Ops. +// Note that this pass only supports static shape tensors for now. +std::unique_ptr> +CreateLegalizeTFXlaCallModuleToStablehloPass(); + +} // namespace stablehlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_LEGALIZE_TF_XLA_CALL_MODULE_TO_STABLEHLO_PASS_H_ +// LINT.ThenChange(//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h) diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/README.md b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/README.md new file mode 100644 index 000000000000..01d84ae1c577 --- /dev/null +++ b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/README.md @@ -0,0 +1,5 @@ +This temporary directory was created to store MHLO pass .cc and .h files. These +files have been migrated to StableHLO but are still used by inactive or +potentially outdated compilation paths. Once all MHLO passes have been migrated +to StableHLO, revisit this directory. At that point, we can replace the uses of +MHLO passes from this directory with the StableHLO passes. \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/mhlo_passes/fuse_convolution_pass.cc b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/fuse_convolution_pass.cc similarity index 97% rename from tensorflow/compiler/mlir/lite/stablehlo/transforms/mhlo_passes/fuse_convolution_pass.cc rename to tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/fuse_convolution_pass.cc index a701f7830841..a54393cfd26a 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/mhlo_passes/fuse_convolution_pass.cc +++ b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/fuse_convolution_pass.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/fuse_convolution_pass.h" + #include #include #include @@ -36,8 +38,8 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/utils/validators.h" #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" namespace mlir { @@ -95,7 +97,7 @@ class FuseMhloMulAndConvolutionPattern : public OpRewritePattern { // format and backprop input conv filter is in HWOI format. // Only fuses multiplier if all dimensions other than the out channel // dimension are equal to 1. 
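The constraint in the comment above, that every multiplier dimension except the trailing output-channel dimension must be 1, can be pictured with this hedged sketch; the helper name is hypothetical and is not the checked-in IsDimensionsDegenerateExceptLastOne.

// Hypothetical sketch of the shape constraint: {1, 1, 1, C} passes, {1, 2, 1, C} does not.
static bool AllButLastDimAreOne(llvm::ArrayRef<int64_t> shape) {
  if (shape.size() <= 1) return true;  // Rank-0 and rank-1 shapes trivially satisfy it.
  for (int64_t dim : shape.drop_back()) {
    if (dim != 1) return false;
  }
  return true;
}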
- if (!TFL::IsDimensionsDegenerateExceptLastOne( + if (!TF::IsDimensionsDegenerateExceptLastOne( mul_value.getShapedType().getShape())) { return rewriter.notifyMatchFailure(mul_op, [&](::mlir::Diagnostic &diag) { diag << "entities 'mul_value' failed to satisfy constraint: " diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/fuse_convolution_pass.h b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/fuse_convolution_pass.h new file mode 100644 index 000000000000..0d9455a3d9c1 --- /dev/null +++ b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/fuse_convolution_pass.h @@ -0,0 +1,32 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_MHLO_PASSES_FUSE_CONVOLUTION_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_MHLO_PASSES_FUSE_CONVOLUTION_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace odml { + +// Fuses MHLO binary element-wise ops and convolution op. +std::unique_ptr createFuseConvolutionPass(); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_MHLO_PASSES_FUSE_CONVOLUTION_PASS_H_ diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/tf_fuse_convolution_pass.cc b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/tf_fuse_convolution_pass.cc new file mode 100644 index 000000000000..2ca7f96c9b34 --- /dev/null +++ b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/tf_fuse_convolution_pass.cc @@ -0,0 +1,202 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/tf_fuse_convolution_pass.h" + +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/tf_attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/utils/validators.h" +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir { +namespace odml::tf_quant { + +using ::mlir::tf_quant::FindUserOfType; + +class FuseMhloMulAndConvolutionPattern : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(mhlo::MulOp mul_op, + PatternRewriter &rewriter) const override { + // Variables for capturing values and attributes used while creating ops. + mhlo::ConvolutionOp conv_op; + Operation *bcast_or_const_op; + shape::ShapeOfOp shape_of_op; + mhlo::ConstantOp filter; + mhlo::ConstantOp multiplier; + mlir::ElementsAttr filter_value, mul_value; + mlir::DenseIntElementsAttr broadcast_dims; + + // Match and capture values/attributes. + Value lhs = mul_op.getLhs(); + Value rhs = mul_op.getRhs(); + conv_op = lhs.getDefiningOp(); + if (conv_op == nullptr) { + return failure(); + } + filter = conv_op.getRhs().getDefiningOp(); + if (filter == nullptr) { + return failure(); + } + // Try to match static broadcast or dynamic broadcast. + bcast_or_const_op = rhs.getDefiningOp(); + bool is_dynamic_broadcast = + isa(bcast_or_const_op); + multiplier = isa(bcast_or_const_op) + ? dyn_cast_or_null(bcast_or_const_op) + : bcast_or_const_op->getOperand(0) + .getDefiningOp(); + if (multiplier == nullptr) { + return failure(); + } + + auto result_type = OpTrait::util::getBroadcastedType(filter.getType(), + multiplier.getType()); + if (!result_type) { + return rewriter.notifyMatchFailure(mul_op, [&](::mlir::Diagnostic &diag) { + diag << "entities 'filter, multiplier' failed to satisfy constraint: " + "non-broadcastable operands"; + }); + } + filter_value = filter.getValue(); + mul_value = multiplier.getValue(); + // In MHLO, Conv filter is in HWIO format, Depthwise conv filter is in HW1O + // format and backprop input conv filter is in HWOI format. + // Only fuses multiplier if all dimensions other than the out channel + // dimension are equal to 1. 
+ if (!TF::IsDimensionsDegenerateExceptLastOne( + mul_value.getShapedType().getShape())) { + return rewriter.notifyMatchFailure(mul_op, [&](::mlir::Diagnostic &diag) { + diag << "entities 'mul_value' failed to satisfy constraint: " + "unsupported dimensions"; + }); + } + if (!is_dynamic_broadcast && + !((*conv_op.getODSResults(0).begin()).hasOneUse())) { + return rewriter.notifyMatchFailure(mul_op, [&](::mlir::Diagnostic &diag) { + diag << "entities 'conv' failed to satisfy constraint: has one use"; + }); + } + // For dynamic case, the result of conv should be used by shape_of and mul. + if (is_dynamic_broadcast) { + auto conv_uses = (*conv_op.getODSResults(0).begin()).getUses(); + if (std::distance(conv_uses.begin(), conv_uses.end()) != 2 || + FindUserOfType(conv_op) == + nullptr || + FindUserOfType(conv_op) == nullptr) { + return rewriter.notifyMatchFailure(mul_op, [&](::mlir::Diagnostic + &diag) { + diag << "entities 'conv' failed to satisfy constraint: has two uses " + "for dynamic case"; + }); + } + } + + // Rewrite + // For dynamic case, we use filter's shape to create a static broadcast. + broadcast_dims = + !isa(bcast_or_const_op) && !is_dynamic_broadcast + ? dyn_cast_or_null(bcast_or_const_op) + .getBroadcastDimensions() + : nullptr; + if (broadcast_dims == nullptr) { + const auto filter_rank = filter_value.getShapedType().getRank(); + auto dimsType = RankedTensorType::get({1}, rewriter.getIntegerType(64)); + broadcast_dims = DenseIntElementsAttr::get(dimsType, {filter_rank - 1}); + } + Value broadcast_multiplier = rewriter.create( + mul_op.getLoc(), filter.getType(), multiplier, broadcast_dims); + Value new_filter = rewriter.create( + mul_op.getLoc(), filter.getType(), filter, broadcast_multiplier); + Value new_conv = rewriter.create( + mul_op.getLoc(), conv_op.getType(), conv_op.getLhs(), new_filter, + conv_op.getWindowStridesAttr(), conv_op.getPaddingAttr(), + conv_op.getLhsDilationAttr(), conv_op.getRhsDilationAttr(), + conv_op.getWindowReversalAttr(), conv_op.getDimensionNumbers(), + conv_op.getFeatureGroupCount(), conv_op.getBatchGroupCount(), + conv_op.getPrecisionConfigAttr()); + // For static case, replace the convolution op now. + if (!is_dynamic_broadcast) { + rewriter.replaceOp(mul_op, {new_conv}); + } else { + // For dynamic case, create new shape_of op and replace uses. + shape_of_op = + dyn_cast_or_null(bcast_or_const_op) + .getOutputDimensions() + .getDefiningOp(); + // Check if the shape come from the original conv op. 
+ if (!shape_of_op || + shape_of_op.getArg().getDefiningOp() != + conv_op) { + return failure(); + } + Value new_shape_of = rewriter.create( + mul_op.getLoc(), shape_of_op.getType(), new_conv); + shape_of_op.replaceAllUsesWith(new_shape_of); + rewriter.replaceOp(mul_op, {new_conv}); + } + + return success(); + } +}; + +class FuseMhloConvolutionPass + : public PassWrapper> { + public: + StringRef getArgument() const final { return "fuse-mhlo-convolution-pass"; } + StringRef getDescription() const final { + return "Fuses MHLO binary element-wise ops and convolution op"; + } + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext()); + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { + return signalPassFailure(); + } + } +}; + +std::unique_ptr createFuseConvolutionPass() { + return std::make_unique(); +} + +static PassRegistration pass; + +} // namespace odml::tf_quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/tf_fuse_convolution_pass.h b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/tf_fuse_convolution_pass.h new file mode 100644 index 000000000000..fcc48446d65b --- /dev/null +++ b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/tf_fuse_convolution_pass.h @@ -0,0 +1,30 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_MHLO_PASSES_TF_FUSE_CONVOLUTION_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_MHLO_PASSES_TF_FUSE_CONVOLUTION_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir::odml::tf_quant { + +// Fuses MHLO binary element-wise ops and convolution op. +std::unique_ptr createFuseConvolutionPass(); + +} // namespace mlir::odml::tf_quant + +#endif // TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_MHLO_PASSES_TF_FUSE_CONVOLUTION_PASS_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/mhlo_passes/unfuse_batch_norm_pass.cc b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/unfuse_batch_norm_pass.cc similarity index 99% rename from tensorflow/compiler/mlir/lite/stablehlo/transforms/mhlo_passes/unfuse_batch_norm_pass.cc rename to tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/unfuse_batch_norm_pass.cc index 62cccb503a3d..e02f6cf75926 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/mhlo_passes/unfuse_batch_norm_pass.cc +++ b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/unfuse_batch_norm_pass.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/unfuse_batch_norm_pass.h" + #include #include #include @@ -33,7 +35,6 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h" diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/unfuse_batch_norm_pass.h b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/unfuse_batch_norm_pass.h new file mode 100644 index 000000000000..fa5035771d42 --- /dev/null +++ b/tensorflow/compiler/mlir/stablehlo/transforms/mhlo_passes/unfuse_batch_norm_pass.h @@ -0,0 +1,32 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_MHLO_PASSES_UNFUSE_BATCH_NORM_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_MHLO_PASSES_UNFUSE_BATCH_NORM_PASS_H_ + +#include + +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace odml { + +// Unfuses MHLO batch norm inference op into arithmetic ops. +std::unique_ptr createUnfuseBatchNormPass(); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_MHLO_PASSES_UNFUSE_BATCH_NORM_PASS_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.cc b/tensorflow/compiler/mlir/stablehlo/transforms/rename_entrypoint_to_main.cc similarity index 97% rename from tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.cc rename to tensorflow/compiler/mlir/stablehlo/transforms/rename_entrypoint_to_main.cc index 23b2ccdc83a6..ac9682029380 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.cc +++ b/tensorflow/compiler/mlir/stablehlo/transforms/rename_entrypoint_to_main.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
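The unfuse_batch_norm_pass.h added above describes the pass as unfusing the MHLO batch-norm inference op into arithmetic ops. As a hedged reference for what that decomposition computes, the per-channel formula is sketched below; the pass's actual op-by-op expansion is not reproduced here.

// Reference arithmetic for batch-norm inference, per channel c (formula only):
//   y = (x - mean[c]) * scale[c] / sqrt(variance[c] + epsilon) + offset[c]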
==============================================================================*/ -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/rename_entrypoint_to_main.h" #include #include diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h b/tensorflow/compiler/mlir/stablehlo/transforms/rename_entrypoint_to_main.h similarity index 76% rename from tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h rename to tensorflow/compiler/mlir/stablehlo/transforms/rename_entrypoint_to_main.h index e56b7130132b..18a435c20a55 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h +++ b/tensorflow/compiler/mlir/stablehlo/transforms/rename_entrypoint_to_main.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_RENAME_ENTRYPOINT_TO_MAIN_H_ -#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_RENAME_ENTRYPOINT_TO_MAIN_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_RENAME_ENTRYPOINT_TO_MAIN_H_ +#define TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_RENAME_ENTRYPOINT_TO_MAIN_H_ #include @@ -28,4 +28,4 @@ std::unique_ptr CreateRenameEntrypointToMainPass(); } // namespace odml } // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_RENAME_ENTRYPOINT_TO_MAIN_H_ +#endif // TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_RENAME_ENTRYPOINT_TO_MAIN_H_ diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/stablehlo_passes.h b/tensorflow/compiler/mlir/stablehlo/transforms/stablehlo_passes.h deleted file mode 100644 index d08c700977df..000000000000 --- a/tensorflow/compiler/mlir/stablehlo/transforms/stablehlo_passes.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_STABLEHLO_PASSES_H_ -#define TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_STABLEHLO_PASSES_H_ - -#include - -#include "mlir/Pass/Pass.h" // from @llvm-project - -namespace mlir { -namespace odml { - -// Constant folds broadcast_in_dim op conditionally. 
-std::unique_ptr createFoldBroadcastPass(); - -} // namespace odml -} // namespace mlir - -#endif // TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_STABLEHLO_PASSES_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.cc b/tensorflow/compiler/mlir/stablehlo/transforms/tf_stablehlo_pass.cc similarity index 96% rename from tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.cc rename to tensorflow/compiler/mlir/stablehlo/transforms/tf_stablehlo_pass.cc index a3b2b47ac9f7..b4f726ed4db8 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.cc +++ b/tensorflow/compiler/mlir/stablehlo/transforms/tf_stablehlo_pass.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/tf_stablehlo_pass.h" #include #include @@ -32,8 +32,7 @@ limitations under the License. #include "mlir/Transforms/Passes.h" // from @llvm-project #include "stablehlo/dialect/ChloOps.h" // from @stablehlo #include "stablehlo/dialect/Register.h" // from @stablehlo -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_passes.h" -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_util.h" +#include "tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla_passes.h" diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h b/tensorflow/compiler/mlir/stablehlo/transforms/tf_stablehlo_pass.h similarity index 81% rename from tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h rename to tensorflow/compiler/mlir/stablehlo/transforms/tf_stablehlo_pass.h index c26a3f36daf6..2a1df5add974 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h +++ b/tensorflow/compiler/mlir/stablehlo/transforms/tf_stablehlo_pass.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_TF_STABLEHLO_PASS_H_ -#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_TF_STABLEHLO_PASS_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_TF_STABLEHLO_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_TF_STABLEHLO_PASS_H_ #include "mlir/Pass/PassManager.h" // from @llvm-project @@ -30,4 +30,4 @@ void AddLegalizeTFToStablehloPasses(OpPassManager& pm, } // namespace odml } // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_TF_STABLEHLO_PASS_H_ +#endif // TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_TF_STABLEHLO_PASS_H_ diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/utils.cc b/tensorflow/compiler/mlir/stablehlo/transforms/utils.cc new file mode 100644 index 000000000000..d440f20e6d97 --- /dev/null +++ b/tensorflow/compiler/mlir/stablehlo/transforms/utils.cc @@ -0,0 +1,55 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/stablehlo/transforms/utils.h" + +#include + +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" +#include "xla/mlir_hlo/utils/hlo_utils.h" + +namespace mlir { +namespace odml { + +mhlo::ConstantOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value, + OpBuilder* builder) { + return builder->create(loc, + hlo::getScalarOfType(ty, raw_value)); +} + +mhlo::ConstantOp GetScalarNegZeroOfType(Type ty, Location loc, + OpBuilder* builder) { + return builder->create(loc, + hlo::getScalarNegZeroOfType(ty)); +} + +DenseIntElementsAttr GetI64ElementsAttr(ArrayAttr attr) { + RankedTensorType ty = + RankedTensorType::get(static_cast(attr.size()), + IntegerType::get(attr.getContext(), 64)); + return DenseIntElementsAttr::get(ty, attr.getValue()); +} + +DenseIntElementsAttr GetI64ElementsAttr(ArrayRef values, + Builder* builder) { + RankedTensorType ty = RankedTensorType::get( + {static_cast(values.size())}, builder->getIntegerType(64)); + return DenseIntElementsAttr::get(ty, values); +} + +} // namespace odml +} // namespace mlir diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/utils.h b/tensorflow/compiler/mlir/stablehlo/transforms/utils.h new file mode 100644 index 000000000000..b048850056ea --- /dev/null +++ b/tensorflow/compiler/mlir/stablehlo/transforms/utils.h @@ -0,0 +1,63 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_UTILS_H_ + +#include + +#include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir { +namespace odml { + +// Builds body for reduce op by using the template binary op as the +// reducer op. 
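A hedged usage sketch for the BuildReduceBody helper defined just below; `reduce` (an mhlo::ReduceOp) and `builder` (an OpBuilder) are assumed to exist in the caller, and the template argument selects the binary reducer op.

// Sketch only: build a max-reduction body for an existing mhlo::ReduceOp.
BuildReduceBody<mhlo::MaxOp>(builder.getF32Type(), &reduce.getBody(), &builder);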
+template +void BuildReduceBody(Type element_type, Region* body, OpBuilder* builder) { + OpBuilder::InsertionGuard guard(*builder); + Block* block = builder->createBlock(body); + + // Block arguments are scalars of the given element type. + Type type = RankedTensorType::get(/*shape=*/{}, element_type); + Location loc = body->getLoc(); + block->addArguments({type, type}, SmallVector(2, loc)); + + auto reducer = + builder->create(loc, block->getArgument(0), block->getArgument(1)); + builder->create(loc, reducer.getResult()); +} + +mhlo::ConstantOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value, + OpBuilder* builder); + +mhlo::ConstantOp GetScalarNegZeroOfType(Type ty, Location loc, + OpBuilder* builder); + +// Converts an ArrayAttr to a 1D 64-bit dense elements attribute. +DenseIntElementsAttr GetI64ElementsAttr(ArrayAttr attr); +DenseIntElementsAttr GetI64ElementsAttr(llvm::ArrayRef values, + Builder* builder); + +} // namespace odml +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_STABLEHLO_TRANSFORMS_UTILS_H_ diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/utils_test.cc b/tensorflow/compiler/mlir/stablehlo/transforms/utils_test.cc new file mode 100644 index 000000000000..dd989d8971a7 --- /dev/null +++ b/tensorflow/compiler/mlir/stablehlo/transforms/utils_test.cc @@ -0,0 +1,82 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/stablehlo/transforms/utils.h" + +#include + +#include +#include +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir { +namespace odml { +namespace { + +TEST(UtilsTest, GetScalarConstOfType) { + MLIRContext context; + context.loadDialect(); + OpBuilder builder(&context); + Location loc = UnknownLoc::get(&context); + Type ty = builder.getI32Type(); + mhlo::ConstantOp op = GetScalarConstOfType(ty, loc, 123, &builder); + EXPECT_EQ(op.getValue().getValues()[0], 123); + + op->destroy(); +} + +TEST(UtilsTest, GetScalarNegZeroOfType) { + MLIRContext context; + context.loadDialect(); + OpBuilder builder(&context); + Location loc = UnknownLoc::get(&context); + Type ty = builder.getF32Type(); + mhlo::ConstantOp op = GetScalarNegZeroOfType(ty, loc, &builder); + EXPECT_EQ(op.getValue().getValues()[0], -0.f); + + op->destroy(); +} + +TEST(UtilsTest, GetI64ElementsAttr) { + MLIRContext context; + context.loadDialect(); + OpBuilder builder(&context); + Location loc = UnknownLoc::get(&context); + SmallVector values = {1, 2, 3}; + auto valuesAttr = builder.getI64ArrayAttr(values); + DenseIntElementsAttr attr = GetI64ElementsAttr(valuesAttr); + EXPECT_THAT(SmallVector(attr.getValues()), + testing::ElementsAreArray(values)); +} + +TEST(UtilsTest, GetI64ElementsAttrBuilder) { + MLIRContext context; + context.loadDialect(); + OpBuilder builder(&context); + Location loc = UnknownLoc::get(&context); + SmallVector values = {1, 2, 3}; + DenseIntElementsAttr attr = GetI64ElementsAttr(values, &builder); + EXPECT_THAT(SmallVector(attr.getValues()), + testing::ElementsAreArray(values)); +} + +} // namespace + +} // namespace odml +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 81bf61234707..4cf0cfc3f9d0 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -47,16 +47,10 @@ td_library( gentbl_cc_library( name = "tensorflow_op_interfaces_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "ir/tf_op_interfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "ir/tf_op_interfaces.cc.inc", - ), - ], + tbl_outs = { + "ir/tf_op_interfaces.h.inc": ["-gen-op-interface-decls"], + "ir/tf_op_interfaces.cc.inc": ["-gen-op-interface-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tf_op_interfaces.td", test = True, @@ -68,12 +62,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_struct_doc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-dialect-doc"], - "g3doc/tf_ops.md", - ), - ], + tbl_outs = {"g3doc/tf_ops.md": ["-gen-dialect-doc"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tf_ops.td", test = True, @@ -107,16 +96,10 @@ cc_library( gentbl_cc_library( name = "tensorflow_all_ops_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-decls"], - "ir/tf_all_ops.h.inc", - ), - ( - ["-gen-op-defs"], - "ir/tf_all_ops.cc.inc", - ), - ], + tbl_outs = { + "ir/tf_all_ops.h.inc": ["-gen-op-decls"], + "ir/tf_all_ops.cc.inc": ["-gen-op-defs"], + 
}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tf_ops.td", deps = [ @@ -140,22 +123,16 @@ tf_ops_category_list = [ gentbl_cc_library( name = "tensorflow_" + target["name"] + "_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-op-decls", - "-op-include-regex=" + target["include"], - ], - "ir/tf_" + target["name"] + ".h.inc", - ), - ( - [ - "-gen-op-defs", - "-op-include-regex=" + target["include"], - ], - "ir/tf_" + target["name"] + ".cc.inc", - ), - ], + tbl_outs = { + "ir/tf_" + target["name"] + ".h.inc": [ + "-gen-op-decls", + "-op-include-regex=" + target["include"], + ], + "ir/tf_" + target["name"] + ".cc.inc": [ + "-gen-op-defs", + "-op-include-regex=" + target["include"], + ], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tf_ops.td", deps = [ @@ -167,22 +144,16 @@ tf_ops_category_list = [ gentbl_cc_library( name = "tensorflow_remaining_ops_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-op-decls", - "-op-exclude-regex=" + "|".join([target["include"] for target in tf_ops_category_list]), - ], - "ir/tf_remaining_ops.h.inc", - ), - ( - [ - "-gen-op-defs", - "-op-exclude-regex=" + "|".join([target["include"] for target in tf_ops_category_list]), - ], - "ir/tf_remaining_ops.cc.inc", - ), - ], + tbl_outs = { + "ir/tf_remaining_ops.h.inc": [ + "-gen-op-decls", + "-op-exclude-regex=" + "|".join([target["include"] for target in tf_ops_category_list]), + ], + "ir/tf_remaining_ops.cc.inc": [ + "-gen-op-defs", + "-op-exclude-regex=" + "|".join([target["include"] for target in tf_ops_category_list]), + ], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tf_ops.td", deps = [ @@ -193,20 +164,11 @@ gentbl_cc_library( gentbl_cc_library( name = "tf_saved_model_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-decls"], - "ir/tf_saved_model.h.inc", - ), - ( - ["-gen-op-defs"], - "ir/tf_saved_model.cc.inc", - ), - ( - ["-gen-dialect-doc"], - "g3doc/tf_saved_model.md", - ), - ], + tbl_outs = { + "ir/tf_saved_model.h.inc": ["-gen-op-decls"], + "ir/tf_saved_model.cc.inc": ["-gen-op-defs"], + "g3doc/tf_saved_model.md": ["-gen-dialect-doc"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tf_saved_model_ops.td", test = True, @@ -219,23 +181,14 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_executor_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-decls"], - "ir/tf_executor.h.inc", - ), - ( - ["-gen-op-defs"], - "ir/tf_executor.cc.inc", - ), - ( - [ - "-gen-dialect-doc", - "-dialect=tf_executor", - ], - "g3doc/tf_executor.md", - ), - ], + tbl_outs = { + "ir/tf_executor.h.inc": ["-gen-op-decls"], + "ir/tf_executor.cc.inc": ["-gen-op-defs"], + "g3doc/tf_executor.md": [ + "-gen-dialect-doc", + "-dialect=tf_executor", + ], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tf_executor_ops.td", test = True, @@ -250,20 +203,11 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_device_ops_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-decls"], - "ir/tf_device.h.inc", - ), - ( - ["-gen-op-defs"], - "ir/tf_device.cc.inc", - ), - ( - ["-gen-dialect-doc"], - "g3doc/tf_device.md", - ), - ], + tbl_outs = { + "ir/tf_device.h.inc": ["-gen-op-decls"], + "ir/tf_device.cc.inc": ["-gen-op-defs"], + "g3doc/tf_device.md": ["-gen-dialect-doc"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tf_device_ops.td", test = 
True, @@ -1034,9 +978,9 @@ cc_library( ":mlir_roundtrip_flags", ":serialize_mlir_module_utils", ":tensorflow", - "//tensorflow/compiler/mlir/lite/tools:translate_cl_options", "//tensorflow/compiler/mlir/tensorflow/translate/tools:parsers", "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util", + "//tensorflow/compiler/mlir/tools:translate_cl_options", "//tensorflow/compiler/mlir/utils:string_container_utils", "//tensorflow/compiler/tf2xla:layout_util", "//tensorflow/compiler/tf2xla:xla_argument", @@ -1695,6 +1639,7 @@ cc_library( deps = [ "tensorflow_side_effects", "tensorflow_types", + "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:SideEffectInterfaces", "@llvm-project//mlir:Support", diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.cc b/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.cc index 372446641382..29b93f10e839 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.h" +#include #include #include "llvm/ADT/STLExtras.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/BUILD b/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/BUILD index ccf7b0b547ab..f1ab2432181e 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/BUILD @@ -31,16 +31,10 @@ td_library( gentbl_cc_library( name = "tensorflow_tfrt_ops_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-decls"], - "tfrt_ops.h.inc", - ), - ( - ["-gen-op-defs"], - "tfrt_ops.cc.inc", - ), - ], + tbl_outs = { + "tfrt_ops.h.inc": ["-gen-op-decls"], + "tfrt_ops.cc.inc": ["-gen-op-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tfrt_ops.td", deps = [ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td index d58b2c7bd650..e6cee35a8202 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td @@ -47,11 +47,11 @@ def TfExecutor_Dialect : Dialect { } // Control type. -def TfeControlType : Type()">, "control">, +def TfeControlType : Type($_self)">, "control">, BuildableType<"$_builder.getType()">; // Token type. -def TfeTokenType : Type()">, "token">, +def TfeTokenType : Type($_self)">, "token">, BuildableType<"$_builder.getType()">; // TODO(hinsu): Define and use TensorType instead of AnyType for data operands diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index f563d350cfdb..721f245e45a3 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -15382,7 +15382,7 @@ e.g. Max(segment_ids) should be equal to `num_segments` - 1 for a 1-d segment_id With inconsistent num_segments, the op still runs. only difference is, the output takes the size of num_segments irrespective of size of segment_ids and data. 
for num_segments less than expected output size, the last elements are ignored -for num_segments more than the expected output size, last elements are assigned +for num_segments more than the expected output size, last elements are assigned smallest possible value for the specific numeric type. For example: @@ -15552,7 +15552,7 @@ e.g. Max(segment_ids) should be equal to `num_segments` - 1 for a 1-d segment_id With inconsistent num_segments, the op still runs. only difference is, the output takes the size of num_segments irrespective of size of segment_ids and data. for num_segments less than expected output size, the last elements are ignored -for num_segments more than the expected output size, last elements are assigned +for num_segments more than the expected output size, last elements are assigned the largest possible value for the specific numeric type. For example: @@ -15658,7 +15658,7 @@ The only difference with SegmentProd is the additional input `num_segments`. This helps in evaluating the output shape in compile time. `num_segments` should be consistent with segment_ids. e.g. Max(segment_ids) - 1 should be equal to `num_segments` for a 1-d segment_ids -With inconsistent num_segments, the op still runs. only difference is, +With inconsistent num_segments, the op still runs. only difference is, the output takes the size of num_segments irrespective of size of segment_ids and data. for num_segments less than expected output size, the last elements are ignored for num_segments more than the expected output size, last elements are assigned 1. @@ -21424,7 +21424,8 @@ platform argument (see `platforms`) nor the dimension arguments (see DefaultValuedOptionalAttr:$platforms, DefaultValuedOptionalAttr:$function_list, DefaultValuedOptionalAttr:$has_token_input_output, - DefaultValuedOptionalAttr:$disabled_checks + DefaultValuedOptionalAttr:$disabled_checks, + DefaultValuedOptionalAttr:$use_shardy_partitioner ); let results = (outs diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index 127210340114..d7ae0542890a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -144,24 +144,24 @@ def TF_UniqueResourceAllocation: TraitList<[ //===----------------------------------------------------------------------===// class TF_OperandIsUnrankedPred : - CPred<"$_op.getOperand(" # n # ").getType().isa()">; + CPred<"llvm::isa($_op.getOperand(" # n # ").getType())">; class TF_ResultIsUnrankedPred : - CPred<"$_op.getResult(" # n # ").getType().isa()">; + CPred<"llvm::isa($_op.getResult(" # n # ").getType())">; // Returns true if the n-th operand has unknown rank or has rank m. class TF_OperandHasRank : PredOpTrait<"operand " # n # " is " # m # "-D", Or<[TF_OperandIsUnrankedPred, - CPred<"$_op.getOperand(" # n # - ").getType().cast().getRank() == " # m>]>>; + CPred<"llvm::cast($_op.getOperand(" # n # + ").getType()).getRank() == " # m>]>>; // Returns true if the n-th result has unknown rank or has rank m. 
class TF_ResultHasRank : PredOpTrait<"result " # n # " is " # m # "-D", Or<[TF_ResultIsUnrankedPred, - CPred<"$_op.getResult(" # n # - ").getType().cast().getRank() == " # m>]>>; + CPred<"llvm::cast($_op.getResult(" # n # + ").getType()).getRank() == " # m>]>>; //===----------------------------------------------------------------------===// // TensorFlow resources and side effects @@ -282,12 +282,12 @@ class TF_Op traits = []> : //===----------------------------------------------------------------------===// class TF_TensorFlowAttr : - Attr()">, + Attr($_self)">, "TensorFlow " # description # " attribute">; def TF_ShapeAttr : TF_TensorFlowAttr<"Shape", "shape"> { let returnType = "std::optional>"; - let convertFromStorage = "$_self.cast().getValue()"; + let convertFromStorage = "llvm::cast($_self).getValue()"; // Create a ranked shape attr by default. let constBuilderCall = "mlir::TF::ShapeAttr::get($_builder.getContext(), $0)"; @@ -309,11 +309,11 @@ def TF_SymbolRefArrayAttr : // Any tensor element type defined in the TensorFlow dialect def TF_TFDialectType : - Type()">, "TensorFlow type">; + Type($_self)">, "TensorFlow type">; // Class for any TensorFlow dialect specific type class TF_TensorFlowType : - Type()">, + Type($_self)">, "TensorFlow " # description # " type">, BuildableType<"getType()">; @@ -547,9 +547,9 @@ def TF_Tensor : TensorOf<[TF_ElementType]>; // A string attribute whose value are one of the values in `cases`. class TF_AnyStrAttrOf cases> : StringBasedAttr< CPred().getValue() == \"" # !head(cases) # "\"", + "llvm::cast($_self).getValue() == \"" # !head(cases) # "\"", !foreach(case, !tail(cases), - "$_self.cast().getValue() == \"" # case # "\""), + "llvm::cast($_self).getValue() == \"" # case # "\""), prev, cur, prev # " || " # cur)>, "string attribute whose value is " # !foldl(/*init*/!head(cases), /*list*/!tail(cases), @@ -558,8 +558,8 @@ class TF_AnyStrAttrOf cases> : StringBasedAttr< // TODO: Use EnumAttr to define the common attribute cases def TF_ConvnetDataFormatAttr : StringBasedAttr< - CPred<"$_self.cast().getValue() == \"NHWC\" || " # - "$_self.cast().getValue() == \"NCHW\"">, + CPred<"llvm::cast($_self).getValue() == \"NHWC\" || " # + "llvm::cast($_self).getValue() == \"NCHW\"">, "'NHWC' or 'NCHW' convnet data format">; //===----------------------------------------------------------------------===// @@ -679,7 +679,7 @@ class TF_DerivedResultShapeListAttr : DerivedAttr< // A derived attribute that returns the shape of the first result type. 
def TF_DerivedResultShapeAttr : DerivedAttr<"ShapedType", - "return (*getOperation()->result_type_begin()).cast();", + "return llvm::cast((*getOperation()->result_type_begin()));", [{ mlir::TF::ShapeAttr::get($_ctxt, $_self) }]>; def TF_IntTypeAttr : TypeAttrBase<"IntegerType", "integer type"> { @@ -713,14 +713,14 @@ class WithBroadcastableCmpOpBuilder { OpBuilder<(ins "Value":$x, "Value":$y), [{ Type resultType; - if (x.getType().isa() || - y.getType().isa()) { + if (llvm::isa(x.getType()) || + llvm::isa(y.getType())) { resultType = UnrankedTensorType::get($_builder.getI1Type()); } else { SmallVector resultShape; if (!OpTrait::util::getBroadcastedShape( - x.getType().cast().getShape(), - y.getType().cast().getShape(), resultShape)) { + llvm::cast(x.getType()).getShape(), + llvm::cast(y.getType()).getShape(), resultShape)) { mlir::emitError($_state.location, "operands have no broadcastable shapes"); } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 83dca69fc1a9..c989178f5fb4 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -57,7 +57,7 @@ class TF_TensorListInitOp : TF_Op { // Returns data type of the result handle. Returned type contains type of // the TensorList element as a subtype. VariantType handle_dtype() { - return getElementTypeOrSelf(getHandle().getType()).cast(); + return llvm::cast(getElementTypeOrSelf(getHandle().getType())); } }]; } @@ -118,7 +118,7 @@ An n-way switch statement, implementing the following: // Prefer passing in SymbolTableCollection to reduce lookup costs by // enabling reusing cached symbol table lookup. func::FuncOp ResolveBranchFunction(::mlir::SymbolTableCollection* table, int index) { - auto flat_sym_ref = getBranches()[index].cast(); + auto flat_sym_ref = llvm::cast(getBranches()[index]); if (table) return table->lookupNearestSymbolFrom(*this, flat_sym_ref); return SymbolTable::lookupNearestSymbolFrom(*this, flat_sym_ref); @@ -854,14 +854,14 @@ Example: "return getElementTypeOrSelf(resource_subtype());">; DerivedAttr shape = DerivedAttr< "ShapedType", - "return resource_subtype().cast();", + "return llvm::cast(resource_subtype());", [{ mlir::TF::ShapeAttr::get($_ctxt, $_self) }]>; let extraClassDeclaration = [{ TensorType resource_subtype() { return resource_type().getSubtypes()[0]; } ResourceType resource_type() { - return getElementTypeOrSelf(getResource()).cast(); + return llvm::cast(getElementTypeOrSelf(getResource())); } }]; @@ -2210,6 +2210,36 @@ def TF_XlaSparseDenseMatmulWithCsrInputOp : TF_Op<"XlaSparseDenseMatmulWithCsrIn ); } +def TF_XlaSparseDenseMatmulCustomCombinerOnTcWithCsrInputOp : TF_Op<"XlaSparseDenseMatmulCustomCombinerOnTcWithCsrInput", [Pure]> { + let summary = "This op looks up the embedding vectors on SparseCores and performs the given combiner computation on TensorCores."; + + let arguments = (ins + TF_Int32Tensor:$row_pointers, + TF_Int32Tensor:$sorted_sample_ids, + TF_Int32Tensor:$sorted_token_ids, + TF_Int32Tensor:$sorted_pos_ids, + TF_Float32Tensor:$sorted_gains, + TF_Float32Tensor:$embedding_table, + TF_Float32Tensor:$weights, + + ConfinedAttr]>:$input_size, + ConfinedAttr]>:$max_valency, + ConfinedAttr]>:$num_weights, + OptionalAttr:$quantization_config_low, + OptionalAttr:$quantization_config_high, + OptionalAttr:$quantization_config_num_buckets, + + SymbolRefAttr:$combiner_computation, + StrAttr:$table_name + ); + + let results = (outs + TF_Float32Tensor:$activations, + 
TF_Int32Tensor:$preserved_valencies, + TF_Float32Tensor:$preserved_vectors + ); +} + def TF_XlaSparseDenseMatmulGradWithSgdAndCsrInputOp : TF_Op<"XlaSparseDenseMatmulGradWithSgdAndCsrInput", [Pure]> { let summary = ""; @@ -2819,6 +2849,282 @@ def TF_XlaSparseDenseMatmulGradWithCsrInputOp : TF_Op<"XlaSparseDenseMatmulGradW TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<5>; } +def TF_XlaSparseDenseMatmulCustomCombinerOnTcGradWithSgdAndCsrInputOp : TF_Op<"XlaSparseDenseMatmulCustomCombinerOnTcGradWithSgdAndCsrInput", [Pure]> { + let summary = "This op back-propagates the activation gradients to the embedding table and the combiner weights."; + + let arguments = (ins + TF_Int32Tensor:$row_pointers, + TF_Int32Tensor:$sorted_sample_ids, + TF_Int32Tensor:$sorted_token_ids, + TF_Int32Tensor:$sorted_pos_ids, + TF_Float32Tensor:$sorted_gains, + // Custom combiner learnable weights to be updated in this backward pass. + TF_Float32Tensor:$weights, + // Preserved outputs of the SparseCore embedding forward pass (for TC + // combiner VJP). + TF_Int32Tensor:$preserved_valencies, + TF_Float32Tensor:$preserved_vectors, + TF_Float32Tensor:$preserved_weights, + // Gradients of the activation. + TF_Float32Tensor:$activation_gradients, + // Learning rate of the embedding table. + TF_Float32Tensor:$learning_rate, + // Learning rate of the custom combiner weights (using SGD). + TF_Float32Tensor:$combiner_weights_learning_rate, + TF_Float32Tensor:$embedding_table, + + ConfinedAttr]>:$max_valency, + ConfinedAttr]>:$num_weights, + F32Attr:$clip_weight_min, + F32Attr:$clip_weight_max, + + SymbolRefAttr:$combiner_table_vjp_computation, + SymbolRefAttr:$combiner_weights_vjp_computation, + StrAttr:$table_name + ); + + let results = (outs + TF_Float32Tensor:$updated_embedding_table, + TF_Float32Tensor:$updated_weights + ); +} + +def TF_XlaSparseDenseMatmulCustomCombinerOnTcGradWithAdagradAndCsrInputOp : TF_Op<"XlaSparseDenseMatmulCustomCombinerOnTcGradWithAdagradAndCsrInput", [Pure]> { + let summary = "This op back-propagates the activation gradients to the embedding table and the combiner weights."; + + let arguments = (ins + TF_Int32Tensor:$row_pointers, + TF_Int32Tensor:$sorted_sample_ids, + TF_Int32Tensor:$sorted_token_ids, + TF_Int32Tensor:$sorted_pos_ids, + TF_Float32Tensor:$sorted_gains, + // Custom combiner learnable weights to be updated in this backward pass. + TF_Float32Tensor:$weights, + // Preserved outputs of the SparseCore embedding forward pass (for TC + // combiner VJP). + TF_Int32Tensor:$preserved_valencies, + TF_Float32Tensor:$preserved_vectors, + TF_Float32Tensor:$preserved_weights, + // Gradients of the activation. + TF_Float32Tensor:$activation_gradients, + // Learning rate of the embedding table. + TF_Float32Tensor:$learning_rate, + // Learning rate of the custom combiner weights (using SGD). 
+ TF_Float32Tensor:$combiner_weights_learning_rate, + TF_Float32Tensor:$embedding_table, + TF_Float32Tensor:$accumulator, + + ConfinedAttr]>:$max_valency, + ConfinedAttr]>:$num_weights, + F32Attr:$clip_weight_min, + F32Attr:$clip_weight_max, + + SymbolRefAttr:$combiner_table_vjp_computation, + SymbolRefAttr:$combiner_weights_vjp_computation, + StrAttr:$table_name + ); + + let results = (outs + TF_Float32Tensor:$updated_embedding_table, + TF_Float32Tensor:$updated_accumulator, + TF_Float32Tensor:$updated_weights + ); +} + +def TF_XlaSparseDenseMatmulCustomCombinerOnTcGradWithAdagradMomentumAndCsrInputOp : TF_Op<"XlaSparseDenseMatmulCustomCombinerOnTcGradWithAdagradMomentumAndCsrInput", [Pure]> { + let summary = "This op back-propagates the activation gradients to the embedding table and the combiner weights."; + + let arguments = (ins + TF_Int32Tensor:$row_pointers, + TF_Int32Tensor:$sorted_sample_ids, + TF_Int32Tensor:$sorted_token_ids, + TF_Int32Tensor:$sorted_pos_ids, + TF_Float32Tensor:$sorted_gains, + // Custom combiner learnable weights to be updated in this backward pass. + TF_Float32Tensor:$weights, + // Preserved outputs of the SparseCore embedding forward pass (for TC + // combiner VJP). + TF_Int32Tensor:$preserved_valencies, + TF_Float32Tensor:$preserved_vectors, + TF_Float32Tensor:$preserved_weights, + // Gradients of the activation. + TF_Float32Tensor:$activation_gradients, + // Learning rate of the embedding table. + TF_Float32Tensor:$learning_rate, + // Learning rate of the custom combiner weights (using SGD). + TF_Float32Tensor:$combiner_weights_learning_rate, + TF_Float32Tensor:$embedding_table, + TF_Float32Tensor:$accumulator, + TF_Float32Tensor:$momenta, + + ConfinedAttr]>:$max_valency, + ConfinedAttr]>:$num_weights, + F32Attr:$clip_weight_min, + F32Attr:$clip_weight_max, + + BoolAttr:$use_nesterov, + F32Attr:$exponent, + F32Attr:$beta1, + F32Attr:$beta2, + F32Attr:$epsilon, + + SymbolRefAttr:$combiner_table_vjp_computation, + SymbolRefAttr:$combiner_weights_vjp_computation, + StrAttr:$table_name + ); + + let results = (outs + TF_Float32Tensor:$updated_embedding_table, + TF_Float32Tensor:$updated_accumulator, + TF_Float32Tensor:$updated_momenta, + TF_Float32Tensor:$updated_weights + ); +} + +def TF_XlaSparseDenseMatmulCustomCombinerOnTcGradWithAdamAndCsrInputOp : TF_Op<"XlaSparseDenseMatmulCustomCombinerOnTcGradWithAdamAndCsrInput", [Pure]> { + let summary = "This op back-propagates the activation gradients to the embedding table and the combiner weights."; + + let arguments = (ins + TF_Int32Tensor:$row_pointers, + TF_Int32Tensor:$sorted_sample_ids, + TF_Int32Tensor:$sorted_token_ids, + TF_Int32Tensor:$sorted_pos_ids, + TF_Float32Tensor:$sorted_gains, + // Custom combiner learnable weights to be updated in this backward pass. + TF_Float32Tensor:$weights, + // Preserved outputs of the SparseCore embedding forward pass (for TC + // combiner VJP). + TF_Int32Tensor:$preserved_valencies, + TF_Float32Tensor:$preserved_vectors, + TF_Float32Tensor:$preserved_weights, + // Gradients of the activation. + TF_Float32Tensor:$activation_gradients, + // Learning rate of the embedding table. + TF_Float32Tensor:$learning_rate, + // Learning rate of the custom combiner weights (using SGD). 
+ TF_Float32Tensor:$combiner_weights_learning_rate, + TF_Float32Tensor:$embedding_table, + TF_Float32Tensor:$momenta, + TF_Float32Tensor:$velocity, + + ConfinedAttr]>:$max_valency, + ConfinedAttr]>:$num_weights, + F32Attr:$clip_weight_min, + F32Attr:$clip_weight_max, + + BoolAttr:$use_sum_inside_sqrt, + F32Attr:$beta1, + F32Attr:$beta2, + F32Attr:$epsilon, + + SymbolRefAttr:$combiner_table_vjp_computation, + SymbolRefAttr:$combiner_weights_vjp_computation, + StrAttr:$table_name + ); + + let results = (outs + TF_Float32Tensor:$updated_embedding_table, + TF_Float32Tensor:$updated_momenta, + TF_Float32Tensor:$updated_velocity, + TF_Float32Tensor:$updated_weights + ); +} + +def TF_XlaSparseDenseMatmulCustomCombinerOnTcGradWithFtrlAndCsrInputOp : TF_Op<"XlaSparseDenseMatmulCustomCombinerOnTcGradWithFtrlAndCsrInput", [Pure]> { + let summary = "This op back-propagates the activation gradients to the embedding table and the combiner weights."; + + let arguments = (ins + TF_Int32Tensor:$row_pointers, + TF_Int32Tensor:$sorted_sample_ids, + TF_Int32Tensor:$sorted_token_ids, + TF_Int32Tensor:$sorted_pos_ids, + TF_Float32Tensor:$sorted_gains, + // Custom combiner learnable weights to be updated in this backward pass. + TF_Float32Tensor:$weights, + // Preserved outputs of the SparseCore embedding forward pass (for TC + // combiner VJP). + TF_Int32Tensor:$preserved_valencies, + TF_Float32Tensor:$preserved_vectors, + TF_Float32Tensor:$preserved_weights, + // Gradients of the activation. + TF_Float32Tensor:$activation_gradients, + // Learning rate of the embedding table. + TF_Float32Tensor:$learning_rate, + // Learning rate of the custom combiner weights (using SGD). + TF_Float32Tensor:$combiner_weights_learning_rate, + TF_Float32Tensor:$embedding_table, + TF_Float32Tensor:$accumulator, + TF_Float32Tensor:$linear, + + ConfinedAttr]>:$max_valency, + ConfinedAttr]>:$num_weights, + F32Attr:$clip_weight_min, + F32Attr:$clip_weight_max, + + BoolAttr:$multiply_linear_by_learning_rate, + F32Attr:$beta, + F32Attr:$learning_rate_power, + F32Attr:$l1_regularization_strength, + F32Attr:$l2_regularization_strength, + + SymbolRefAttr:$combiner_table_vjp_computation, + SymbolRefAttr:$combiner_weights_vjp_computation, + StrAttr:$table_name + ); + + let results = (outs + TF_Float32Tensor:$updated_embedding_table, + TF_Float32Tensor:$updated_accumulator, + TF_Float32Tensor:$updated_linear, + TF_Float32Tensor:$updated_weights + ); +} + +def TF_XlaSparseDenseMatmulCustomCombinerOnTcGradWithCsrInputOp : TF_Op<"XlaSparseDenseMatmulCustomCombinerOnTcGradWithCsrInput", [AttrSizedOperandSegments, Pure]> { + let summary = "This op back-propagates the activation gradients to the embedding table and the combiner weights."; + + let arguments = (ins + TF_Int32Tensor:$row_pointers, + TF_Int32Tensor:$sorted_sample_ids, + TF_Int32Tensor:$sorted_token_ids, + TF_Int32Tensor:$sorted_pos_ids, + TF_Float32Tensor:$sorted_gains, + // Custom combiner learnable weights to be updated in this backward pass. + TF_Float32Tensor:$weights, + // Preserved outputs of the SparseCore embedding forward pass (for TC + // combiner VJP). + TF_Int32Tensor:$preserved_valencies, + TF_Float32Tensor:$preserved_vectors, + TF_Float32Tensor:$preserved_weights, + // Gradients of the activation. + TF_Float32Tensor:$activation_gradients, + // The embedding table and the associated slot variables. + Variadic:$tables, + // Hyperparameters of the current optimizer. + Variadic:$hyperparameters, + // Learning rate of the custom combiner weights (using SGD). 
+ TF_Float32Tensor:$combiner_weights_learning_rate, + + ConfinedAttr]>:$max_valency, + ConfinedAttr]>:$num_weights, + + SymbolRefAttr:$combiner_table_vjp_computation, + SymbolRefAttr:$combiner_weights_vjp_computation, + SymbolRefAttr:$optimizer_custom_computation, + StrAttr:$table_name + ); + + let results = (outs + Variadic:$updated_tables, + TF_Float32Tensor:$updated_weights + ); + + // Number of embedding table + its associated slot variables. + TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<10>; + // Number of hyperparameters. + TF_DerivedOperandSizeAttr M = TF_DerivedOperandSizeAttr<11>; +} + // b/394499589: move back to tf_generated_ops.td def TF_PartitionedCallOp : TF_Op<"PartitionedCall", [CallOpInterface, DeclareOpInterfaceMethods, Pure]> { let summary = [{ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc index 905f4864655a..ce586b43fd38 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc @@ -98,7 +98,7 @@ namespace { // Returns the equivalent Value skipping through identity nodes. Value LookThroughIdentity(Value result) { while (isa_and_nonnull(result.getDefiningOp())) { - auto op_result = result.cast(); + auto op_result = cast(result); result = op_result.getOwner()->getOperand(op_result.getResultNumber()); } return result; @@ -195,7 +195,7 @@ LogicalResult OneHotOp::verify() { OneHotOp op = *this; int64_t axis = op.getAxis(); - auto indices_ty = op.getIndices().getType().dyn_cast(); + auto indices_ty = llvm::dyn_cast(op.getIndices().getType()); if (indices_ty && !(axis == -1 || (axis >= 0 && axis <= indices_ty.getShape().size()))) { return op.emitOpError() @@ -234,11 +234,11 @@ LogicalResult OneHotOp::verify() { static TensorType InferOneHotOpType(Value indices, Value depth, Value on_value, Value off_value, IntegerAttr axis) { int64_t axis_val = axis.getInt(); - Type element_ty = on_value.getType().cast().getElementType(); + Type element_ty = llvm::cast(on_value.getType()).getElementType(); auto unranked_ty = UnrankedTensorType::get(element_ty); if (axis_val < -1) return unranked_ty; - auto indices_ty = indices.getType().dyn_cast(); + auto indices_ty = llvm::dyn_cast(indices.getType()); if (!indices_ty) return unranked_ty; auto shape = llvm::to_vector<2>(indices_ty.getShape()); @@ -278,7 +278,7 @@ LogicalResult PackOp::verify() { int64_t inputs_rank = -1; for (Value value : values) { - if (auto ty = value.getType().dyn_cast()) { + if (auto ty = llvm::dyn_cast(value.getType())) { // Exit early as input types are verified to be compatible so all ranked // tensors have the same rank. inputs_rank = ty.getRank(); @@ -346,7 +346,7 @@ OpFoldResult PackOp::fold(FoldAdaptor) { auto const_op = dyn_cast_or_null(value.getDefiningOp()); if (!const_op) return std::nullopt; - auto value_attr = const_op.getValue().dyn_cast(); + auto value_attr = llvm::dyn_cast(const_op.getValue()); if (!value_attr || value_attr.getNumElements() != 1) return std::nullopt; auto value_ty = value_attr.getType(); @@ -378,7 +378,7 @@ OpFoldResult PackOp::fold(FoldAdaptor) { return {}; // First tensor dimension is dynamic. - auto arg_ty = tensor.getType().dyn_cast(); + auto arg_ty = llvm::dyn_cast(tensor.getType()); if (!arg_ty || !arg_ty.hasRank() || arg_ty.getNumDynamicDims() != 1 || !arg_ty.isDynamicDim(0)) return {}; @@ -416,8 +416,8 @@ struct ConvertPackToReshape : public OpRewritePattern { } // Check if input and output are static. 
- auto input_ty = pack_op.getOperand(0).getType().cast(); - auto output_ty = pack_op.getOutput().getType().cast(); + auto input_ty = llvm::cast(pack_op.getOperand(0).getType()); + auto output_ty = llvm::cast(pack_op.getOutput().getType()); if (!input_ty.hasStaticShape() || !output_ty.hasStaticShape()) { return failure(); } @@ -467,7 +467,8 @@ LogicalResult PadOp::FoldOperandsPermutation(ArrayRef permutation) { dyn_cast_or_null(getPaddings().getDefiningOp()); if (!paddings_op) return failure(); - auto paddings_value = paddings_op.getValue().dyn_cast(); + auto paddings_value = + llvm::dyn_cast(paddings_op.getValue()); if (!paddings_value || paddings_value.getNumElements() != permutation.size() * 2) return failure(); @@ -493,9 +494,8 @@ LogicalResult PadOp::FoldOperandsPermutation(ArrayRef permutation) { setOperand(1, shuffled_paddings_op); // Change the result type. - getResult().setType(ShuffleRankedTensorType(getResult().getType(), - ReversePermutation(permutation)) - .cast()); + getResult().setType(llvm::cast(ShuffleRankedTensorType( + getResult().getType(), ReversePermutation(permutation)))); return success(); } @@ -561,7 +561,7 @@ LogicalResult ParseExampleV2Op::verify() { template static LogicalResult VerifyPartitionedCall(CallOpClass op, SymbolTableCollection &symbolTable) { - SymbolRefAttr func = op->getAttr("f").template cast(); + SymbolRefAttr func = llvm::cast(op->getAttr("f")); auto function = symbolTable.lookupNearestSymbolFrom(op, func); if (!function) { return op.emitError("'f' attribute refers to an undefined function: ") @@ -625,10 +625,10 @@ void TPUPartitionedCallOp::setCalleeFromCallable( OpFoldResult PowOp::fold(FoldAdaptor adaptor) { auto operands = adaptor.getOperands(); - auto constant_y = operands[1].dyn_cast_or_null(); + auto constant_y = llvm::dyn_cast_if_present(operands[1]); if (constant_y && constant_y.isSplat()) { APFloat y_value = constant_y.getSplatValue(); - auto output_type = getType().cast(); + auto output_type = llvm::cast(getType()); if (y_value.isZero() && output_type.hasStaticShape()) { return DenseElementsAttr::get( output_type, @@ -661,7 +661,7 @@ void QuantizeAndDequantizeV2Op::getCanonicalizationPatterns( // LogicalResult QrOp::verify() { QrOp op = *this; - auto ttype = op.getInput().getType().cast(); + auto ttype = llvm::cast(op.getInput().getType()); if (!ttype.hasRank()) return success(); if (!HasRankAtLeast(op.getInput(), 2)) return op.emitOpError( @@ -765,29 +765,29 @@ void RangeOp::build(OpBuilder &builder, OperationState &result, Value start, builder, result, tensorflow::GetTypeFromTFTensorShape( size.getSExtValue(), - start.getType().cast().getElementType()), + llvm::cast(start.getType()).getElementType()), start, limit, delta); } return RangeOp::build( builder, result, tensorflow::GetTypeFromTFTensorShape( - {-1}, start.getType().cast().getElementType()), + {-1}, llvm::cast(start.getType()).getElementType()), start, limit, delta); } OpFoldResult RangeOp::fold(FoldAdaptor adaptor) { auto operands = adaptor.getOperands(); assert(operands.size() == 3); - auto start_tensor = operands[0].dyn_cast_or_null(); - auto limit_tensor = operands[1].dyn_cast_or_null(); - auto delta_tensor = operands[2].dyn_cast_or_null(); + auto start_tensor = llvm::dyn_cast_if_present(operands[0]); + auto limit_tensor = llvm::dyn_cast_if_present(operands[1]); + auto delta_tensor = llvm::dyn_cast_if_present(operands[2]); if (!(start_tensor && limit_tensor && delta_tensor)) return nullptr; // Operands should all be scalars assert(start_tensor.getShapedType().getRank() == 
0 && limit_tensor.getShapedType().getRank() == 0 && delta_tensor.getShapedType().getRank() == 0); - Type elem_type = getType().cast().getElementType(); + Type elem_type = llvm::cast(getType()).getElementType(); if (elem_type.isSignlessInteger() || elem_type.isUnsignedInteger()) { auto start_attr = start_tensor.getValues()[0]; auto limit_attr = limit_tensor.getValues()[0]; @@ -809,7 +809,7 @@ OpFoldResult RangeOp::fold(FoldAdaptor adaptor) { } return BuildConstRangeTensor(elem_type, num_elements, start_attr, delta_attr); - } else if (elem_type.isa()) { + } else if (isa(elem_type)) { auto start_attr = start_tensor.getValues()[0]; auto limit_attr = limit_tensor.getValues()[0]; auto delta_attr = delta_tensor.getValues()[0]; @@ -836,12 +836,12 @@ void RankOp::build(OpBuilder &builder, OperationState &result, Value input) { // This will create a constant value for RankOp of a ranked tensor. OpFoldResult RankOp::fold(FoldAdaptor) { auto type = getInput().getType(); - auto ranked_type = type.dyn_cast(); + auto ranked_type = llvm::dyn_cast(type); if (!ranked_type) return {}; // DenseIntElementsAttr::get requires the output type be ranked with static // shape. - auto output_type = getType().dyn_cast(); + auto output_type = llvm::dyn_cast(getType()); if (!output_type || !output_type.hasStaticShape()) return {}; int32_t rank = ranked_type.getRank(); @@ -882,11 +882,11 @@ using ReshapeErrorHandler = LogicalResult GetReshapeOutputType(Value tensor, Value shape, ReshapeErrorHandler error_handler, TensorType &output_ty) { - auto tensor_ty = tensor.getType().cast(); + auto tensor_ty = llvm::cast(tensor.getType()); auto element_ty = tensor_ty.getElementType(); output_ty = UnrankedTensorType::get(element_ty); - auto shape_ty = shape.getType().dyn_cast(); + auto shape_ty = llvm::dyn_cast(shape.getType()); if (!shape_ty) return success(); if (shape_ty.getRank() != 1) return error_handler(llvm::formatv( @@ -982,9 +982,9 @@ LogicalResult ReshapeOp::verify() { expected_ty))) return failure(); - auto output_ty = op.getType().dyn_cast(); + auto output_ty = llvm::dyn_cast(op.getType()); if (!output_ty) return success(); - auto tensor_ty = op.getTensor().getType().cast(); + auto tensor_ty = llvm::cast(op.getTensor().getType()); if (output_ty.hasStaticShape() && tensor_ty.hasStaticShape()) { const int64_t output_ty_size = output_ty.getNumElements(); const int64_t tensor_ty_size = tensor_ty.getNumElements(); @@ -1027,7 +1027,7 @@ OpFoldResult ReshapeOp::fold(FoldAdaptor) { // Fold reshape if operand and result types are the same and all dimensions // are statically known (no-op reshape). - auto result_ty = getType().dyn_cast(); + auto result_ty = llvm::dyn_cast(getType()); if (result_ty && result_ty.hasStaticShape() && result_ty == tensor.getType()) { return tensor; @@ -1049,8 +1049,8 @@ OpFoldResult ReshapeOp::fold(FoldAdaptor) { // first dimension equal to `cond`. LogicalResult SelectOp::verify() { SelectOp op = *this; - auto then_tensor = op.getThenValue().getType().cast(); - auto else_tensor = op.getElseValue().getType().cast(); + auto then_tensor = llvm::cast(op.getThenValue().getType()); + auto else_tensor = llvm::cast(op.getElseValue().getType()); // Check (1). 
if (!AreCastCompatible({then_tensor, else_tensor})) return op.emitOpError() << "requires t and e have compatible shapes"; @@ -1081,7 +1081,8 @@ LogicalResult SelectOp::verify() { return success(); } - auto cond_tensor = op.getCondition().getType().dyn_cast(); + auto cond_tensor = + llvm::dyn_cast(op.getCondition().getType()); if (!cond_tensor) return success(); auto cond_rank = cond_tensor.getRank(); // Check (2a) and (2b). @@ -1111,15 +1112,15 @@ LogicalResult SelectOp::verify() { //===----------------------------------------------------------------------===// static Type InferSelectV2OpType(Value condition, Value e, Value t) { - Type element_ty = e.getType().cast().getElementType(); + Type element_ty = llvm::cast(e.getType()).getElementType(); auto unranked_ty = UnrankedTensorType::get(element_ty); Type broadcasted_ty = OpTrait::util::getBroadcastedType(e.getType(), t.getType()); if (!broadcasted_ty) return unranked_ty; - auto cond_ranked_ty = condition.getType().dyn_cast(); - auto broadcasted_ranked_ty = broadcasted_ty.dyn_cast(); + auto cond_ranked_ty = llvm::dyn_cast(condition.getType()); + auto broadcasted_ranked_ty = llvm::dyn_cast(broadcasted_ty); if (!cond_ranked_ty || !broadcasted_ranked_ty) return unranked_ty; // Explicitly get broadcasted output type as element types of condition may @@ -1149,12 +1150,13 @@ LogicalResult VerifyShapeOperandAndResult(Operation *op, Type operand_type, std::string variadic_idx_str = variadic_idx < 0 ? "" : llvm::formatv(" #{0}", variadic_idx).str(); - auto result_ranked_type = result_type.dyn_cast(); + auto result_ranked_type = llvm::dyn_cast(result_type); if (!result_ranked_type) return success(); if (result_ranked_type.getShape().size() != 1) return op->emitOpError("requires 1D type for result") << variadic_idx_str; - auto operand_ranked_type = operand_type.dyn_cast_or_null(); + auto operand_ranked_type = + llvm::dyn_cast_or_null(operand_type); if (operand_ranked_type) { // The operand is a ranked tensor. if (result_ranked_type.hasStaticShape() && @@ -1197,7 +1199,7 @@ LogicalResult ShapeOp::verify() { // Converts shape of the given type to attribute if it is of ranked tensor type. // Returned attribute has integer elements of the given width. static Attribute ConvertShapeToAttr(Type input_ty, int out_width) { - auto ranked_ty = input_ty.dyn_cast(); + auto ranked_ty = llvm::dyn_cast(input_ty); if (!ranked_ty || !ranked_ty.hasStaticShape()) return {}; auto shape = ranked_ty.getShape(); @@ -1214,14 +1216,15 @@ static Attribute ConvertShapeToAttr(Type input_ty, int out_width) { } OpFoldResult ShapeOp::fold(FoldAdaptor) { - int width = - getType().cast().getElementType().getIntOrFloatBitWidth(); + int width = llvm::cast(getType()) + .getElementType() + .getIntOrFloatBitWidth(); return ConvertShapeToAttr(getOperand().getType(), width); } void ShapeOp::build(OpBuilder &builder, OperationState &result, Value input, BoolAttr use32Bit) { - auto rankedTensorType = input.getType().dyn_cast(); + auto rankedTensorType = llvm::dyn_cast(input.getType()); int64_t rank = rankedTensorType ? rankedTensorType.getRank() : -1; auto out_type = use32Bit.getValue() ? 
builder.getIntegerType(32) : builder.getIntegerType(64); @@ -1347,9 +1350,9 @@ LogicalResult SizeOp::verify() { } OpFoldResult SizeOp::fold(FoldAdaptor) { - ShapedType output_type = getType().cast(); + ShapedType output_type = llvm::cast(getType()); if (!output_type.hasRank()) return {}; - ShapedType input_type = getOperand().getType().cast(); + ShapedType input_type = llvm::cast(getOperand().getType()); if (!input_type.hasStaticShape()) return {}; int size = input_type.getNumElements(); return DenseElementsAttr::get( @@ -1395,13 +1398,13 @@ LogicalResult SliceOp::verify() { " same number of elements"; } - auto input_ty = op.getInput().getType().dyn_cast(); + auto input_ty = llvm::dyn_cast(op.getInput().getType()); if (input_ty && begin_ty.getNumElements() != input_ty.getRank()) { return op.emitOpError() << "requires number of elements in begin and size " "are equal to input rank"; } - auto output_ty = op.getOutput().getType().dyn_cast(); + auto output_ty = llvm::dyn_cast(op.getOutput().getType()); if (output_ty && input_ty && output_ty.getRank() != input_ty.getRank()) { return op.emitOpError() << "requires output to have the same rank as input, but got input " @@ -1488,9 +1491,8 @@ LogicalResult SoftmaxOp::verify() { LogicalResult SoftmaxCrossEntropyWithLogitsOp::verify() { SoftmaxCrossEntropyWithLogitsOp op = *this; auto broadcasted_ty = - OpTrait::util::getBroadcastedType(op.getFeatures().getType(), - op.getLabels().getType()) - .dyn_cast_or_null(); + llvm::dyn_cast_or_null(OpTrait::util::getBroadcastedType( + op.getFeatures().getType(), op.getLabels().getType())); if (!broadcasted_ty || (broadcasted_ty.hasRank() && broadcasted_ty.getRank() != 2)) return op.emitOpError( @@ -1516,9 +1518,10 @@ int64_t SpaceToBatchNDBlockRank(const TensorType block_shape_type, LogicalResult SpaceToBatchNDOp::verify() { SpaceToBatchNDOp op = *this; - const auto input_type = op.getInput().getType().cast(); - const auto block_shape_type = op.getBlockShape().getType().cast(); - const auto paddings_type = op.getPaddings().getType().cast(); + const auto input_type = llvm::cast(op.getInput().getType()); + const auto block_shape_type = + llvm::cast(op.getBlockShape().getType()); + const auto paddings_type = llvm::cast(op.getPaddings().getType()); // Check that block_shape has rank 1. if (!IsOfRankOrUnranked(op.getBlockShape(), 1)) { @@ -1626,8 +1629,9 @@ LogicalResult SparseSoftmaxCrossEntropyWithLogitsOp::verify() { if (!IsOfRankOrUnranked(op.getLabels(), 1)) { return op.emitOpError("requires labels operand of rank one"); } - auto features_ty = op.getFeatures().getType().dyn_cast(); - auto labels_ty = op.getLabels().getType().dyn_cast(); + auto features_ty = + llvm::dyn_cast(op.getFeatures().getType()); + auto labels_ty = llvm::dyn_cast(op.getLabels().getType()); if (features_ty && labels_ty) { int64_t features_batches = features_ty.getDimSize(0); int64_t labels_batches = labels_ty.getDimSize(0); @@ -1653,7 +1657,8 @@ LogicalResult VerifySplitInputAndSplitDim(Op op, *dim_index = std::nullopt; Value split_dim = op.getSplitDim(); - if (auto split_dim_type = split_dim.getType().dyn_cast()) + if (auto split_dim_type = + llvm::dyn_cast(split_dim.getType())) if (split_dim_type.getRank() != 0) return op.emitOpError( "split dimension should be an integer scalar tensor"); @@ -1661,8 +1666,7 @@ LogicalResult VerifySplitInputAndSplitDim(Op op, // We can perform further verification if the input tensor to be split has // known rank and the split dimension tensor is a constant. 
- auto input_type = - op.getValue().getType().template dyn_cast(); + auto input_type = llvm::dyn_cast(op.getValue().getType()); if (!input_type) return success(); int64_t input_rank = input_type.getRank(); @@ -1691,8 +1695,8 @@ LogicalResult SplitOp::verify() { if (failed(VerifySplitInputAndSplitDim(op, &dim_index))) return failure(); if (!dim_index) return success(); - int64_t input_dim_size = - op.getValue().getType().cast().getDimSize(*dim_index); + int64_t input_dim_size = llvm::cast(op.getValue().getType()) + .getDimSize(*dim_index); if (ShapedType::isDynamic(input_dim_size)) return success(); if (op.getNumResults() == 0) return failure(); @@ -1711,7 +1715,7 @@ LogicalResult SplitOp::verify() { LogicalResult SplitVOp::verify() { SplitVOp op = *this; auto split_sizes_type = - op.getSizeSplits().getType().dyn_cast(); + llvm::dyn_cast(op.getSizeSplits().getType()); if (!split_sizes_type) return success(); if (split_sizes_type.getRank() != 1 || @@ -1724,8 +1728,8 @@ LogicalResult SplitVOp::verify() { if (failed(VerifySplitInputAndSplitDim(op, &dim_index))) return failure(); if (!dim_index) return success(); - int64_t input_dim_size = - op.getValue().getType().cast().getDimSize(*dim_index); + int64_t input_dim_size = llvm::cast(op.getValue().getType()) + .getDimSize(*dim_index); if (ShapedType::isDynamic(input_dim_size)) return success(); // If split sizes come from a constant, they must sum to the dimension size @@ -1739,7 +1743,7 @@ LogicalResult SplitVOp::verify() { SmallVector split_sizes; split_sizes.reserve( - split_sizes_attr.getType().cast().getNumElements()); + llvm::cast(split_sizes_attr.getType()).getNumElements()); for (const auto &dim : llvm::enumerate(split_sizes_attr)) { int64_t dim_val = dim.value().getSExtValue(); @@ -1785,7 +1789,7 @@ void SquareOp::getCanonicalizationPatterns(RewritePatternSet &results, LogicalResult SqueezeOp::verify() { SqueezeOp op = *this; - auto input_type = op.getInput().getType().dyn_cast(); + auto input_type = llvm::dyn_cast(op.getInput().getType()); if (!input_type) return success(); // Can't verify squeeze dims. @@ -1829,9 +1833,9 @@ void SumOp::build(OpBuilder &builder, OperationState &result, Value input, // TODO: Templatize this fold for all reduction ops. OpFoldResult SumOp::fold(FoldAdaptor) { - auto input_ty = getInput().getType().template dyn_cast(); + auto input_ty = llvm::dyn_cast(getInput().getType()); if (!input_ty) return {}; - auto result_ty = getType().template dyn_cast(); + auto result_ty = llvm::dyn_cast(getType()); if (!result_ty) return {}; // Bypass this op if the result has the same shape and type. This can happen @@ -1866,7 +1870,7 @@ static LogicalResult VerifyStridedSliceBase(OpTy op) { int64_t expected_size = -1; for (Value val : {op.getBegin(), op.getEnd(), op.getStrides()}) { - auto operand_ty = val.getType().dyn_cast(); + auto operand_ty = llvm::dyn_cast(val.getType()); if (!operand_ty || !operand_ty.hasStaticShape()) { // TensorFlow constant ops may have non-static shape because the shape is // not propagated during constant folding. If the defining op for this @@ -2151,7 +2155,7 @@ bool StridedSliceOp::GetSlicedBoundRanges( !matchPattern(getStrides(), m_Constant(&sparse_strides_attr))) return false; - auto input_ty = this->getInput().getType().dyn_cast(); + auto input_ty = llvm::dyn_cast(this->getInput().getType()); if (!input_ty || !input_ty.hasStaticShape()) return false; auto input_shape = llvm::to_vector<4>(input_ty.getShape()); @@ -2210,7 +2214,8 @@ OpFoldResult StridedSliceOp::fold(FoldAdaptor) { // pattern. 
if (getNewAxisMask() != 0) return {}; - auto tensor_ty = shape_op.getInput().getType().dyn_cast(); + auto tensor_ty = + llvm::dyn_cast(shape_op.getInput().getType()); // Only ranked tensor can be folded. if (!tensor_ty) return {}; @@ -2269,8 +2274,8 @@ OpFoldResult StridedSliceOp::fold(FoldAdaptor) { // scalar or a vector based on `shrink_axis_mask` because we have rejected // the case of `new_axis_mask` != 0. auto output_elt_ty = - getOutput().getType().cast().getElementType(); - auto output_ty = getOutput().getType().dyn_cast(); + llvm::cast(getOutput().getType()).getElementType(); + auto output_ty = llvm::dyn_cast(getOutput().getType()); if (!output_ty || !output_ty.hasStaticShape()) { if (getShrinkAxisMask() == 1) { output_ty = tensorflow::GetTypeFromTFTensorShape({}, output_elt_ty); @@ -2296,7 +2301,7 @@ OpFoldResult StridedSliceOp::fold(FoldAdaptor) { LogicalResult StridedSliceGradOp::verify() { StridedSliceGradOp op = *this; - auto shape_type = op.getShape().getType().dyn_cast(); + auto shape_type = llvm::dyn_cast(op.getShape().getType()); if (shape_type && shape_type.getRank() != 1) return op.emitOpError("'shape' operand must be 1D tensor, but got ") << shape_type.getRank() << "D tensor"; @@ -2418,7 +2423,7 @@ LogicalResult TPUExecuteAndUpdateVariablesOp::verify() { TPUExecuteAndUpdateVariablesOp op = *this; int num_resource_args = 0; for (Type arg_type : op.getArgs().getTypes()) - if (arg_type.cast().getElementType().isa()) + if (isa(cast(arg_type).getElementType())) ++num_resource_args; auto check_attr = [&](ArrayAttr indices, llvm::StringRef name, @@ -2431,7 +2436,7 @@ LogicalResult TPUExecuteAndUpdateVariablesOp::verify() { << num_resource_args << "), but got " << indices.size(); for (const auto &entry : llvm::enumerate(indices.getValue())) { - auto int_attr = entry.value().cast(); + auto int_attr = llvm::cast(entry.value()); if (int_attr.getInt() < min) return op.emitOpError() << "requires '" << name << "' to contain values of at least " @@ -2457,20 +2462,16 @@ void TPUExecuteAndUpdateVariablesOp::getEffects( ResourceEffects::TPUExecute::get()); auto resource_handles = llvm::make_filter_range(getArgsMutable(), [](OpOperand &op_operand) { - return op_operand.get() - .getType() - .cast() - .getElementType() - .isa(); + return isa( + cast(op_operand.get().getType()).getElementType()); }); for (const auto& entry : llvm::enumerate(resource_handles)) { OpOperand &op_operand = entry.value(); effects.emplace_back(MemoryEffects::Read::get(), &op_operand, ResourceEffects::Variable::get()); - if (getDeviceVarUpdatesIndices() - .getValue()[entry.index()] - .cast() + if (llvm::cast( + getDeviceVarUpdatesIndices().getValue()[entry.index()]) .getInt() >= 0) effects.emplace_back(MemoryEffects::Write::get(), &op_operand, ResourceEffects::Variable::get()); @@ -2544,10 +2545,11 @@ LogicalResult TensorListReserveOp::verify() { //===----------------------------------------------------------------------===// OpFoldResult TensorListElementShapeOp::fold(FoldAdaptor) { - int width = - getType().cast().getElementType().getIntOrFloatBitWidth(); - auto variant_type = - getElementTypeOrSelf(getOperand().getType()).cast(); + int width = llvm::cast(getType()) + .getElementType() + .getIntOrFloatBitWidth(); + auto variant_type = llvm::cast( + getElementTypeOrSelf(getOperand().getType())); if (variant_type.getSubtypes().empty()) return {}; return ConvertShapeToAttr(variant_type.getSubtypes()[0], width); } @@ -2578,8 +2580,8 @@ LogicalResult TensorScatterUpdateOp::verify() { return op.emitOpError( "requires 
indices operand to have at least 1 dimension"); - auto tensor_ty = op.getTensor().getType().dyn_cast(); - auto indices_ty = op.getIndices().getType().dyn_cast(); + auto tensor_ty = llvm::dyn_cast(op.getTensor().getType()); + auto indices_ty = llvm::dyn_cast(op.getIndices().getType()); if (!tensor_ty || !indices_ty) return success(); int64_t num_index_dims = indices_ty.getShape().back(); @@ -2608,10 +2610,10 @@ LogicalResult TensorScatterUpdateOp::verify() { LogicalResult TileOp::verify() { TileOp op = *this; - auto input_type = op.getInput().getType().dyn_cast(); + auto input_type = llvm::dyn_cast(op.getInput().getType()); auto multiples_type = - op.getMultiples().getType().dyn_cast(); - auto output_type = op.getOutput().getType().dyn_cast(); + llvm::dyn_cast(op.getMultiples().getType()); + auto output_type = llvm::dyn_cast(op.getOutput().getType()); if (multiples_type && multiples_type.getRank() != 1) { return op.emitOpError() << "expected multiples to be rank 1, got rank = " @@ -2745,7 +2747,7 @@ class FuseWithBroadcastCompatibleOp continue; } - auto shape = tile.getInput().getType().dyn_cast(); + auto shape = llvm::dyn_cast(tile.getInput().getType()); if (!shape) { continue; } @@ -2837,13 +2839,13 @@ class ToBoolOfRankedTensor : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(ToBoolOp op, PatternRewriter &rewriter) const override { - auto type = op.getOperand().getType().dyn_cast(); + auto type = llvm::dyn_cast(op.getOperand().getType()); // If the input is an unranked tensor, cannpt rewrite. if (!type) return failure(); // Expected return type of the ToBool operation. The return type of ToBool // operation is always 0D tensor of bool type. - auto result_type = op.getResult().getType().cast(); + auto result_type = llvm::cast(op.getResult().getType()); // If input is already a tensor, it can be folded into an identity. if (type == result_type) { @@ -2858,7 +2860,7 @@ class ToBoolOfRankedTensor : public OpRewritePattern { Attribute zero_attr; if (element_type.isIntOrFloat()) zero_attr = rewriter.getZeroAttr(type); - else if (element_type.isa()) + else if (isa(element_type)) zero_attr = DenseStringElementsAttr::get(type, {""}); if (!zero_attr) return failure(); @@ -2905,7 +2907,7 @@ LogicalResult TPUPartitionedInputV2Op::verify() { int num_partitions = 1; const mlir::ArrayAttr partition_dims = op.getPartitionDims(); for (const mlir::Attribute &dim : partition_dims) { - num_partitions *= dim.cast().getInt(); + num_partitions *= llvm::cast(dim).getInt(); } const bool is_packed = op.getIsPacked(); @@ -2926,9 +2928,9 @@ LogicalResult TPUPartitionedInputV2Op::verify() { LogicalResult TransposeOp::verify() { TransposeOp op = *this; - auto perm_type = op.getPerm().getType().dyn_cast(); - auto x_type = op.getX().getType().dyn_cast(); - auto y_type = op.getY().getType().dyn_cast(); + auto perm_type = llvm::dyn_cast(op.getPerm().getType()); + auto x_type = llvm::dyn_cast(op.getX().getType()); + auto y_type = llvm::dyn_cast(op.getY().getType()); if (perm_type && perm_type.getRank() != 1) { return op.emitOpError() @@ -2985,7 +2987,7 @@ LogicalResult TransposeOp::verify() { // TODO(jpienaar): perm could be optional too. void TransposeOp::build(OpBuilder &builder, OperationState &result, Value x, Value perm) { - auto x_type = x.getType().cast(); + auto x_type = llvm::cast(x.getType()); // If value is unranked, then so is results. 
if (!x_type.hasRank()) return TransposeOp::build(builder, result, @@ -2995,7 +2997,7 @@ void TransposeOp::build(OpBuilder &builder, OperationState &result, Value x, // TODO(jpienaar): Handle unknown perm case. // TODO(jpienaar): Extract utility function. - auto etype = x_type.cast().getElementType(); + auto etype = llvm::cast(x_type).getElementType(); DenseIntElementsAttr attr_shape; if (matchPattern(perm, m_Constant(&attr_shape))) { llvm::SmallVector const_shape; @@ -3040,7 +3042,7 @@ OpFoldResult FoldCancellableTranspose(TransposeOp op) { if (transpose->getBlock() != op->getBlock()) { tensorflow::DataType dtype; auto status = tensorflow::ConvertToDataType( - op.getX().getType().cast().getElementType(), &dtype); + llvm::cast(op.getX().getType()).getElementType(), &dtype); if (status.ok()) { // We can only leave the transpose op on host if its dtype is supported on // host. @@ -3104,7 +3106,7 @@ class NMSV3ToNMSV4Op : public OpRewritePattern { } SmallVector new_result_types; new_result_types.push_back(nms_op.getType()); - auto input_ty = nms_op.getType().template cast(); + auto input_ty = llvm::cast(nms_op.getType()); // corresponds to the second result type of nmsv4 RankedTensorType valid_output_type = tensorflow::GetTypeFromTFTensorShape({}, input_ty.getElementType()); @@ -3184,7 +3186,7 @@ LogicalResult XlaCallModuleOp::verifySymbolUses( SymbolTableCollection &symbolTable) { for (auto f : getFunctionList()) { auto func = symbolTable.lookupNearestSymbolFrom( - getOperation(), f.cast()); + getOperation(), llvm::cast(f)); if (!func) { return emitOpError() << "refers to an undefined function: " << f; } @@ -3223,7 +3225,7 @@ std::optional XlaLaunchOp::GetResourceInstanceStr() { LogicalResult UnpackOp::verify() { UnpackOp op = *this; - auto value_type = op.getValue().getType().dyn_cast(); + auto value_type = llvm::dyn_cast(op.getValue().getType()); if (!value_type) return success(); int64_t value_rank = value_type.getRank(); @@ -3321,9 +3323,9 @@ static LogicalResult VerifyUnsortedSegmentReduction(Op op) { if (!HasRankAtMost(op.getNumSegments(), 0)) return op.emitOpError("number of segments should be a 0-D tensor"); - auto data_type = op.getData().getType().template dyn_cast(); + auto data_type = llvm::dyn_cast(op.getData().getType()); auto segment_ids_type = - op.getSegmentIds().getType().template dyn_cast(); + llvm::dyn_cast(op.getSegmentIds().getType()); if (data_type && segment_ids_type) { if (data_type.getRank() < segment_ids_type.getRank()) return op.emitOpError( @@ -3434,11 +3436,12 @@ void VariableOp::getCanonicalizationPatterns(RewritePatternSet &results, LogicalResult VariableShapeOp::verify() { VariableShapeOp op = *this; - auto input_type = op.getInput().getType().cast(); + auto input_type = llvm::cast(op.getInput().getType()); if (input_type.hasStaticShape() && input_type.getNumElements() != 1) return op.emitOpError("requires input to have one resource"); - auto resource_type = input_type.getElementType().cast(); + auto resource_type = + llvm::cast(input_type.getElementType()); auto subtypes = resource_type.getSubtypes(); switch (subtypes.size()) { case 1: @@ -3453,10 +3456,11 @@ LogicalResult VariableShapeOp::verify() { } OpFoldResult VariableShapeOp::fold(FoldAdaptor) { - int width = - getType().cast().getElementType().getIntOrFloatBitWidth(); - auto resource_type = - getElementTypeOrSelf(getOperand().getType()).cast(); + int width = llvm::cast(getType()) + .getElementType() + .getIntOrFloatBitWidth(); + auto resource_type = llvm::cast( + 
getElementTypeOrSelf(getOperand().getType())); if (resource_type.getSubtypes().empty()) return {}; return ConvertShapeToAttr(resource_type.getSubtypes()[0], width); } @@ -3566,7 +3570,7 @@ LogicalResult WhileRegionOp::verify() { << "condition should yield a tensor and forward the arguments"; auto cond_type = - cond_yield->getOperand(0).getType().dyn_cast(); + llvm::dyn_cast(cond_yield->getOperand(0).getType()); if (!cond_type || !cond_type.getShape().equals({}) || !cond_type.getElementType().isInteger(/*width=*/1)) return op.emitOpError() @@ -3852,8 +3856,8 @@ LogicalResult XlaBroadcastHelperOp::inferReturnTypeComponents( return success(); }; - RankedTensorType lhs_ty = lhs.getType().dyn_cast(); - RankedTensorType rhs_ty = rhs.getType().dyn_cast(); + RankedTensorType lhs_ty = llvm::dyn_cast(lhs.getType()); + RankedTensorType rhs_ty = llvm::dyn_cast(rhs.getType()); if (!lhs_ty || !rhs_ty) return set_unranked_results(); int64_t lhs_rank = lhs_ty.getRank(); @@ -3871,8 +3875,8 @@ LogicalResult XlaBroadcastHelperOp::inferReturnTypeComponents( "if broadcast_dims is empty, both arguments must have equal rank or " "at least one argument must be a scalar"); } - inferredReturnShapes.emplace_back(lhs_ty.cast()); - inferredReturnShapes.emplace_back(rhs_ty.cast()); + inferredReturnShapes.emplace_back(llvm::cast(lhs_ty)); + inferredReturnShapes.emplace_back(llvm::cast(rhs_ty)); return success(); } @@ -3904,9 +3908,9 @@ LogicalResult XlaBroadcastHelperOp::inferReturnTypeComponents( if (broadcast_lhs) { inferredReturnShapes.emplace_back(broadcast_shape, lhs_ty.getElementType()); - inferredReturnShapes.emplace_back(rhs_ty.cast()); + inferredReturnShapes.emplace_back(llvm::cast(rhs_ty)); } else { - inferredReturnShapes.emplace_back(lhs_ty.cast()); + inferredReturnShapes.emplace_back(llvm::cast(lhs_ty)); inferredReturnShapes.emplace_back(broadcast_shape, rhs_ty.getElementType()); } return success(); @@ -3984,7 +3988,7 @@ LogicalResult XlaSetDynamicDimensionSizeOp::inferReturnTypeComponents( SmallVectorImpl &inferredReturnShapes) { XlaSetDynamicDimensionSizeOpAdaptor op(operands.getValues(), attributes); - TensorType operand_ty = op.getInput().getType().cast(); + TensorType operand_ty = llvm::cast(op.getInput().getType()); Type element_ty = operand_ty.getElementType(); TensorType result_ty; @@ -4009,7 +4013,7 @@ LogicalResult XlaSetDynamicDimensionSizeOp::inferReturnTypeComponents( result_ty = UnrankedTensorType::get(element_ty); } - inferredReturnShapes.emplace_back(result_ty.cast()); + inferredReturnShapes.emplace_back(llvm::cast(result_ty)); return success(); } @@ -4045,7 +4049,7 @@ void XlaReduceOp::getCanonicalizationPatterns(RewritePatternSet &results, LogicalResult XlaReduceWindowOp::verify() { XlaReduceWindowOp op = *this; - const auto &input_ty = op.getInput().getType().cast(); + const auto &input_ty = llvm::cast(op.getInput().getType()); auto check = [&](mlir::Value val, std::string attr_name) -> LogicalResult { ElementsAttr attr; @@ -4114,7 +4118,7 @@ LogicalResult XlaReduceWindowOp::verify() { LogicalResult XlaSelectAndScatterOp::verify() { XlaSelectAndScatterOp op = *this; - auto input_ty = op.getOperand().getType().cast(); + auto input_ty = llvm::cast(op.getOperand().getType()); auto check = [&](mlir::Value val, std::string attr_name) -> LogicalResult { ElementsAttr attr; @@ -4188,9 +4192,9 @@ LogicalResult XlaVariadicReduceOp::verify() { // We rely on V2 for the majority of the checks. 
const auto &input_ty = op.getInput().getType(); if (input_ty.empty()) return op.emitOpError() << "No input"; - const auto &dtype = input_ty[0].cast().getElementType(); + const auto &dtype = llvm::cast(input_ty[0]).getElementType(); for (const auto &ty : input_ty) { - if (ty.cast().getElementType() != dtype) + if (llvm::cast(ty).getElementType() != dtype) return op.emitOpError() << "This version is limited to operands of the same dtype"; } @@ -4234,10 +4238,10 @@ LogicalResult XlaVariadicReduceV2Op::verify() { << n_init_values << ")"; } - auto input_ty_0 = inputs_ty[0].cast(); + auto input_ty_0 = llvm::cast(inputs_ty[0]); if (input_ty_0.hasStaticShape()) { for (int i = 1; i < n_inputs; ++i) { - auto input_ty_i = inputs_ty[i].cast(); + auto input_ty_i = llvm::cast(inputs_ty[i]); if (input_ty_i.hasStaticShape() && input_ty_i.getShape() != input_ty_0.getShape()) { return op.emitOpError() @@ -4254,7 +4258,7 @@ LogicalResult XlaVariadicReduceV2Op::verify() { } for (int i = 0; i < n_inputs; ++i) { - auto init_value_ty_i = init_values_ty[i].cast(); + auto init_value_ty_i = llvm::cast(init_values_ty[i]); if (init_value_ty_i.hasRank() && init_value_ty_i.getRank() != 0) { return op.emitOpError() << "init_values[" << i << "] must be a scalar but got [" @@ -4280,10 +4284,10 @@ LogicalResult XlaVariadicSortOp::verify() { XlaVariadicSortOp op = *this; const auto &inputs_ty = op.getInputs().getType(); int n_inputs = inputs_ty.size(); - auto input_ty_0 = inputs_ty[0].cast(); + auto input_ty_0 = llvm::cast(inputs_ty[0]); if (input_ty_0.hasStaticShape()) { for (int i = 1; i < n_inputs; ++i) { - auto input_ty_i = inputs_ty[i].cast(); + auto input_ty_i = llvm::cast(inputs_ty[i]); if (input_ty_i.hasStaticShape() && input_ty_i.getShape() != input_ty_0.getShape()) { return op.emitOpError() @@ -4318,10 +4322,9 @@ LogicalResult XlaVariadicSortOp::verify() { LogicalResult SetStaticDimensionBoundsOp::verify() { SetStaticDimensionBoundsOp op = *this; - mlir::ShapedType input_type = - op.getInput().getType().cast(); + mlir::ShapedType input_type = llvm::cast(op.getInput().getType()); mlir::ShapedType static_shape_type = - op.getStaticShape().getType().cast(); + llvm::cast(op.getStaticShape().getType()); int input_type_rank = input_type.hasRank() ? 
input_type.getRank() : -1; if (input_type_rank > 2) { return op.emitOpError() << "was used with an input tensor with rank > 2, " @@ -4348,8 +4351,8 @@ template LogicalResult VerifyScalesAndZeroPoints(UniformQuantizedOp op, Value scales, Value zero_points, int32_t quantization_axis) { - ShapedType scales_type = scales.getType().cast(); - ShapedType zero_points_type = zero_points.getType().cast(); + ShapedType scales_type = llvm::cast(scales.getType()); + ShapedType zero_points_type = llvm::cast(zero_points.getType()); if (quantization_axis == -1) { if (scales_type.hasRank() && scales_type.getRank() != 0) { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/convert_control_to_data_outputs.mlir b/tensorflow/compiler/mlir/tensorflow/tests/convert_control_to_data_outputs.mlir index 5f59e3549815..abff7aeb61a2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/convert_control_to_data_outputs.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/convert_control_to_data_outputs.mlir @@ -656,7 +656,6 @@ func.func @incomplete_composite_devices_while_body(%arg0: !tf_res, %arg1: !tf_re %mul, %mul_control = tf_executor.island wraps "tf.Mul"(%arg2, %arg2) : (tensor, tensor) -> tensor %control_barrier = tf_executor.island(%assign_control_0, %assign_control_1, %add_control, %exe_control) wraps "tf.NoOp"() : () -> () // CHECK: [[exe]]{{.*}}"tf.Identity" - // CHECK-NOT: "tf.Identity" // CHECK: tf_executor.fetch tf_executor.fetch %arg0, %arg1, %add, %control_barrier, %mul_control : tensor>>, tensor>>, tensor, !tf_executor.control, !tf_executor.control } @@ -816,11 +815,11 @@ func.func @tpu_execute_with_non_resource_operands(%arg0: !tf_res {tf._composite_ func.func @double_tpu_execute_while_body(%arg0: !tf_res, %arg1: !tf_res, %arg2: tensor) -> (!tf_res, !tf_res, tensor) { - // CHECK: "tf.Identity" %graph:3 = tf_executor.graph { // CHECK: {{.*}}, [[ctrl1:%.*]] = tf_executor.island wraps "tf.Identity" // CHECK: {{.*}}, [[ctrl2:%.*]] = tf_executor.island wraps "tf.Identity" // CHECK: "tf.Identity" + // CHECK: "tf.Identity" %key, %key_control = tf_executor.island wraps "tf.Const"() {value = dense<"">: !tf_str} : () -> !tf_str // CHECK: [[exe_ctrl1:%.*]] = tf_executor.island([[ctrl1]]) wraps "tf.TPUExecuteAndUpdateVariables" %exe_control1 = tf_executor.island wraps "tf.TPUExecuteAndUpdateVariables"(%arg2, %arg0, %arg1, %key) { @@ -887,9 +886,9 @@ func.func @tpu_executes_on_same_device_while_body(%arg0: !tf_res, %arg1: !tf_res %arg2: tensor) -> (!tf_res, !tf_res, tensor) { %graph:3 = tf_executor.graph { - // CHECK: "tf.Identity" // CHECK: {{.*}}, [[id_ctrl:%.*]] = tf_executor.island wraps "tf.Identity" // CHECK: "tf.Identity" + // CHECK: "tf.Identity" %key, %key_control = tf_executor.island wraps "tf.Const"() {value = dense<"">: !tf_str} : () -> !tf_str // CHECK: [[exe_ctrl1:%.*]] = tf_executor.island([[id_ctrl]]) wraps "tf.TPUExecuteAndUpdateVariables" %exe_control1 = tf_executor.island wraps "tf.TPUExecuteAndUpdateVariables"(%arg2, %arg0, %arg1, %key) { @@ -911,8 +910,8 @@ func.func @tpu_executes_on_same_device_while_body(%arg0: !tf_res, %arg1: !tf_res %mul, %mul_control = tf_executor.island wraps "tf.Mul"(%arg2, %arg2) : (tensor, tensor) -> tensor %control_barrier = tf_executor.island(%assign_control_0, %assign_control_1, %add_control, %exe_control1, %exe_control2) wraps "tf.NoOp"() : () -> () - // CHECK: "tf.Identity"(%arg3) // CHECK: tf_executor.island([[exe_ctrl1]], [[exe_ctrl2]]) wraps "tf.Identity" + // CHECK: "tf.Identity"(%arg4) // CHECK: "tf.Identity"(%arg5) // CHECK-NEXT: tf_executor.fetch 
tf_executor.fetch %arg0, %arg1, %add, %control_barrier, %mul_control : tensor>>, tensor>>, tensor, !tf_executor.control, !tf_executor.control diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 5b9b032719cf..8207032ffdb1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -1317,7 +1317,7 @@ func.func @testIfRegionElseTerminator(%arg0: tensor, %arg1: tensor<2xf32>) - // tf.Region yield number of results should match op number of results func.func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{'tf.IfRegion' op region control flow edge from Region #0 to parent results: source has 2 operands, but target successor needs 1}} + // expected-error @+1 {{'tf.IfRegion' op region control flow edge from Region #0 to parent results: source has 2 operands, but target successor needs 1}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t, %t) : (tensor<2xf32>, tensor<2xf32>) -> () @@ -1332,7 +1332,7 @@ func.func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) // ----- func.func @testIfRegionElseResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{'tf.IfRegion' op region control flow edge from Region #1 to parent results: source has 2 operands, but target successor needs 1}} + // expected-error @+1 {{'tf.IfRegion' op region control flow edge from Region #1 to parent results: source has 2 operands, but target successor needs 1}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD index 1ffeac4df158..54d92b5b2ece 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD @@ -13,12 +13,7 @@ package( gentbl_cc_library( name = "tensorflow_canonicalize_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "generated_canonicalize.inc", - ), - ], + tbl_outs = {"generated_canonicalize.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "canonicalize.td", deps = [ @@ -29,12 +24,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tensorflow_reduce_patterns_inc_gen", - tbl_outs = [ - ( - ["-gen-rewriters"], - "reducer/tf_reduce_patterns.inc", - ), - ], + tbl_outs = {"reducer/tf_reduce_patterns.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "reducer/tf_mlir_reduce_patterns.td", deps = [ @@ -89,12 +79,7 @@ cc_library( gentbl_cc_library( name = "decompose_resource_ops_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "generated_decompose_resource_ops.inc", - ), - ], + tbl_outs = {"generated_decompose_resource_ops.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "decompose_resource_ops.td", deps = [ @@ -118,6 +103,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/core:framework", + "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", ], ) @@ -152,12 +138,7 @@ cc_library( gentbl_cc_library( name = "tf_data_optimization_inc_gen", compatible_with = 
get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "generated_tf_data_optimization.inc", - ), - ], + tbl_outs = {"generated_tf_data_optimization.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tf_data_optimization.td", deps = [ @@ -376,19 +357,13 @@ cc_library( gentbl_cc_library( name = "tf_pass_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=TensorFlow", - ], - "tf_passes.h.inc", - ), - ( - ["-gen-pass-doc"], - "g3doc/_includes/tf_passes.md", - ), - ], + tbl_outs = { + "tf_passes.h.inc": [ + "-gen-pass-decls", + "-name=TensorFlow", + ], + "g3doc/_includes/tf_passes.md": ["-gen-pass-doc"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tf_passes.td", deps = [ @@ -399,19 +374,13 @@ gentbl_cc_library( gentbl_cc_library( name = "tf_device_pass_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=TensorFlowDevice", - ], - "tf_device_passes.h.inc", - ), - ( - ["-gen-pass-doc"], - "g3doc/includes/tf_device_passes.md", - ), - ], + tbl_outs = { + "tf_device_passes.h.inc": [ + "-gen-pass-decls", + "-name=TensorFlowDevice", + ], + "g3doc/includes/tf_device_passes.md": ["-gen-pass-doc"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tf_device_passes.td", deps = [ @@ -422,19 +391,13 @@ gentbl_cc_library( gentbl_cc_library( name = "tf_savedmodel_pass_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=TensorFlowSavedModel", - ], - "tf_savedmodel_passes.h.inc", - ), - ( - ["-gen-pass-doc"], - "g3doc/includes/tf_savedmodel_passes.md", - ), - ], + tbl_outs = { + "tf_savedmodel_passes.h.inc": [ + "-gen-pass-decls", + "-name=TensorFlowSavedModel", + ], + "g3doc/includes/tf_savedmodel_passes.md": ["-gen-pass-doc"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tf_savedmodel_passes.td", deps = [ @@ -445,19 +408,13 @@ gentbl_cc_library( gentbl_cc_library( name = "tf_test_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=TensorFlowTest", - ], - "test_passes.h.inc", - ), - ( - ["-gen-pass-doc"], - "g3doc/includes/tf_test_passes.md", - ), - ], + tbl_outs = { + "test_passes.h.inc": [ + "-gen-pass-decls", + "-name=TensorFlowTest", + ], + "g3doc/includes/tf_test_passes.md": ["-gen-pass-doc"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tf_test_passes.td", deps = [ @@ -601,7 +558,6 @@ cc_library( ":verify_no_outside_compilation_markers_pass", "//tensorflow/compiler/jit:flags_headers", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", - "//tensorflow/compiler/mlir/lite:validators", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:attribute_utils", "//tensorflow/compiler/mlir/tensorflow:bridge_logger", @@ -643,6 +599,7 @@ cc_library( "//tensorflow/compiler/mlir/tf2xla/transforms:split_into_island_per_op_pass", "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf_with_tf2xla", + "//tensorflow/compiler/mlir/utils:validators", "//tensorflow/compiler/tf2xla:side_effect_util", "//tensorflow/compiler/tf2xla/kernels:xla_call_module_loader", "//tensorflow/core:core_cpu_base", @@ -661,6 +618,7 @@ cc_library( "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/log", + 
"@com_google_absl//absl/log:check", "@com_google_absl//absl/memory", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", @@ -746,6 +704,7 @@ cc_library( "//tensorflow/core/protobuf:for_core_protos_cc", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/log", + "@com_google_absl//absl/status", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", @@ -838,6 +797,7 @@ cc_library( "@local_xla//xla:xla_data_proto_cc", "@local_xla//xla/hlo/translate/hlo_to_mhlo:hlo_utils", "@local_xla//xla/hlo/translate/mhlo_to_hlo:type_to_shape", + "@local_xla//xla/mlir_hlo", "@local_xla//xla/service:shape_inference", "@local_xla//xla/tsl/platform:errors", "@local_xla//xla/tsl/util:env_var", @@ -907,6 +867,9 @@ cc_library( "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", @@ -939,8 +902,13 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/compiler/mlir/tensorflow:tensorflow_traits", "//tensorflow/compiler/mlir/tensorflow:translate_utils", + "//tensorflow/core:protos_all_cc", "//tensorflow/core/tfrt/fallback:fallback_state", "//tensorflow/core/tfrt/fallback:op_kernel_runner", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", @@ -1015,12 +983,7 @@ filegroup( gentbl_cc_library( name = "tensorflow_optimize_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "generated_optimize.inc", - ), - ], + tbl_outs = {"generated_optimize.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "optimize.td", deps = [ @@ -1035,12 +998,7 @@ gentbl_cc_library( gentbl_cc_library( name = "lower_tf_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "generated_lower_tf.inc", - ), - ], + tbl_outs = {"generated_lower_tf.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "lower_tf.td", deps = [ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/annotate_parameter_replication.cc b/tensorflow/compiler/mlir/tensorflow/transforms/annotate_parameter_replication.cc index 52765fb5657e..eb9da461993c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/annotate_parameter_replication.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/annotate_parameter_replication.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include + #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Casting.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc index 72697e4dd3f8..c2377ef625d2 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc @@ -13,11 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include -#include +#include +#include +#include -#include "absl/memory/memory.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/breakup-islands.cc b/tensorflow/compiler/mlir/tensorflow/transforms/breakup-islands.cc index de001cff0c1e..b3cbd103dc5d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/breakup-islands.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/breakup-islands.cc @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include +#include #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h index 81af0f63dbec..2c245ea5cda4 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "absl/base/attributes.h" +#include "absl/status/status.h" #include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize_compile_and_replicate_attributes.cc b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize_compile_and_replicate_attributes.cc index f9821e168675..06d0842a4502 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize_compile_and_replicate_attributes.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize_compile_and_replicate_attributes.cc @@ -20,6 +20,8 @@ limitations under the License. // should be replaced with _xla_compile_device_type with the value of device // attribute. +#include + #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" #include "mlir/IR/Builders.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/check_control_dependencies.cc b/tensorflow/compiler/mlir/tensorflow/transforms/check_control_dependencies.cc index ead82339edf9..d83137a87785 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/check_control_dependencies.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/check_control_dependencies.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include #include +#include "absl/strings/string_view.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc index 3574bc663db5..93d31b884732 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc @@ -19,7 +19,6 @@ limitations under the License. #include #include -#include #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc index c6b25e73ae09..beee1afb1a12 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc @@ -15,7 +15,12 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.h" +#include +#include +#include #include +#include +#include #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc index 355aded4f2d9..082aef84d15d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ #include +#include -#include "absl/strings/str_cat.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_tf_ops_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_tf_ops_pass.cc index 3d3e1305993a..d796526da8f4 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_tf_ops_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_tf_ops_pass.cc @@ -25,7 +25,10 @@ limitations under the License. // does not exist any operation placed on host_B that conumes any result of any // operation placed on host_A. +#include +#include #include +#include #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc index 5a83e75e9eed..4c40c53e250a 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h" +#include +#include #include #include "llvm/ADT/ArrayRef.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.cc index 214d68f60f8b..20fe886f18ae 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.cc @@ -22,6 +22,10 @@ limitations under the License. 
#include #include +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "mlir/IR/Operation.h" // from @llvm-project @@ -31,6 +35,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/tfrt/fallback/fallback_state.h" #include "tensorflow/core/tfrt/fallback/op_kernel_runner.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_op_device_assignment.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_op_device_assignment.cc index b33596ffa09f..93df6da8caf1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_op_device_assignment.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_op_device_assignment.cc @@ -18,6 +18,8 @@ limitations under the License. // op is read by operations placed on multiple devices, then the pass will // replicate the tf.Const op once for each device. +#include + #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" #include "mlir/IR/UseDefLists.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc b/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc index 6262cad26ca6..d63ace094451 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc @@ -424,7 +424,7 @@ void ChainResourceOps( for (auto class_iter = resource_equivalence_classes.begin(); class_iter != resource_equivalence_classes.end(); ++class_iter) { // Only visit one element per class, the leader. - if (!class_iter->isLeader()) continue; + if (!(*class_iter)->isLeader()) continue; // Create chain source and sink identity islands for current equivalence // class. @@ -445,7 +445,7 @@ void ChainResourceOps( // by `class_iter`). Keep track of ops that have already been processed. llvm::SmallDenseSet processed_ops; for (auto member_iter = - resource_equivalence_classes.member_begin(class_iter); + resource_equivalence_classes.member_begin(**class_iter); member_iter != resource_equivalence_classes.member_end(); ++member_iter) { ResourceAndDevice resource_and_device = *member_iter; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/convert_launch_func_to_tf_call.cc b/tensorflow/compiler/mlir/tensorflow/transforms/convert_launch_func_to_tf_call.cc index a261ea5452b1..42cb9f24e002 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/convert_launch_func_to_tf_call.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/convert_launch_func_to_tf_call.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/convert_tf_control_flow_to_scf.cc b/tensorflow/compiler/mlir/tensorflow/transforms/convert_tf_control_flow_to_scf.cc index a9b3b4f68090..d67825af3ac4 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/convert_tf_control_flow_to_scf.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/convert_tf_control_flow_to_scf.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "mlir/Dialect/SCF/IR/SCF.h" // from @llvm-project #include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/convert_to_legacy_compile_and_replicate_attributes.cc b/tensorflow/compiler/mlir/tensorflow/transforms/convert_to_legacy_compile_and_replicate_attributes.cc index e7e9e27f30fb..224ee0cca95c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/convert_to_legacy_compile_and_replicate_attributes.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/convert_to_legacy_compile_and_replicate_attributes.cc @@ -19,6 +19,8 @@ limitations under the License. // This ensures the unified attributes not get exposed outside of the MLIR // bridge with V1 pipeline in some cases. +#include + #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" #include "mlir/IR/Builders.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decode_attributes_hook.cc b/tensorflow/compiler/mlir/tensorflow/transforms/decode_attributes_hook.cc index 399dcbc3b083..a0fe58f8de20 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decode_attributes_hook.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decode_attributes_hook.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include - #include "llvm/ADT/ArrayRef.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_reduce_dataset.cc b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_reduce_dataset.cc index 4af1246d5a72..f6a2bafee9f2 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_reduce_dataset.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_reduce_dataset.cc @@ -13,11 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include -#include #include -#include -#include #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc index 0a205859957c..144bdb440186 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc @@ -15,6 +15,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.h" +#include +#include + +#include "llvm/Support/Casting.h" #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -29,10 +33,8 @@ namespace { // Returns subtype of `resource` if present. Otherwise an unranked tensor type // of `element_type` is returned. static Type GetResourceSubtypeOrDefault(Value resource, Type element_type) { - auto resource_type = resource.getType() - .cast() - .getElementType() - .cast(); + auto resource_type = llvm::cast( + llvm::cast(resource.getType()).getElementType()); if (resource_type.getSubtypes().size() == 1) return resource_type.getSubtypes().front(); @@ -40,19 +42,15 @@ static Type GetResourceSubtypeOrDefault(Value resource, Type element_type) { } static bool HasResourceSubtype(Value resource) { - return resource.getType() - .cast() - .getElementType() - .cast() + return llvm::cast( + llvm::cast(resource.getType()).getElementType()) .getSubtypes() .size() == 1; } static Type GetResourceSubtype(Value resource) { - return resource.getType() - .cast() - .getElementType() - .cast() + return llvm::cast( + llvm::cast(resource.getType()).getElementType()) .getSubtypes() .front(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td index f466c1d48d68..1fc666da4a8d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td @@ -30,7 +30,7 @@ def CreateTFReadVariableOp : NativeCodeCall< "$_builder.create(" " $0.getLoc()," " GetResourceSubtypeOrDefault(" - " $2, $1.getType().cast().getElementType())," + " $2, llvm::cast($1.getType()).getElementType())," " $2)" >; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops_pass.cc index cd5ae2d2fdaa..955baa82032f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops_pass.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include #include +#include #include "llvm/ADT/STLExtras.h" #include "mlir/IR/SymbolTable.h" // from @llvm-project @@ -94,7 +97,7 @@ LogicalResult ApplyPatternsLocallyUntilConverged( auto walk_result = op_with_regions->walk([&patterns, &changed](Operation* operation) { GreedyRewriteConfig config; - config.strictMode = mlir::GreedyRewriteStrictness::ExistingOps; + config.setStrictness(mlir::GreedyRewriteStrictness::ExistingOps); bool op_erased; if (failed(applyOpPatternsAndFold(operation, patterns, config, &op_erased))) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/deduplicate_bound_input_bindings.cc b/tensorflow/compiler/mlir/tensorflow/transforms/deduplicate_bound_input_bindings.cc index 7e1a841b73de..4bb20a1c3585 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/deduplicate_bound_input_bindings.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/deduplicate_bound_input_bindings.cc @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include +#include #include "llvm/ADT/DenseMap.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -51,7 +52,9 @@ void DedupBoundInputBindingPass::runOnOperation() { duplicate_arg.replaceAllUsesWith(original_arg); arg_indices_to_erase.set(i); } - func.eraseArguments(arg_indices_to_erase); + if (failed(func.eraseArguments(arg_indices_to_erase))) { + return signalPassFailure(); + } } } // namespace diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/device_attribute_to_launch.cc b/tensorflow/compiler/mlir/tensorflow/transforms/device_attribute_to_launch.cc index bee301e97a66..8a272ed4a65c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/device_attribute_to_launch.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/device_attribute_to_launch.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc b/tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc index e0467bea4240..74e32c9ea560 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc @@ -15,6 +15,8 @@ limitations under the License. // Converts DeviceIndex to constant device. +#include + #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/drop_while_shape_invariant.cc b/tensorflow/compiler/mlir/tensorflow/transforms/drop_while_shape_invariant.cc index 1b93728352b7..bcce1dbf1843 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/drop_while_shape_invariant.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/drop_while_shape_invariant.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc index f28f3f1447e3..bc4487a4e3fd 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc @@ -17,15 +17,14 @@ limitations under the License. #include #include -#include #include +#include #include #include #include #include #include -#include "absl/memory/memory.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc index 750e1033eec6..da4ef7c86848 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc @@ -18,9 +18,9 @@ limitations under the License. // flow/frames or side effecting ops yet. #include -#include -#include +#include +#include "absl/log/check.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc index 410aa20fe424..8bdd088b2dde 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" @@ -25,6 +27,7 @@ limitations under the License. 
#include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/Inliner.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -55,6 +58,7 @@ void ExecutorTPUV1IslandInliningPass::runOnOperation() { if (!nested_module) return; InlinerInterface inliner(&getContext()); + InlinerConfig config; auto walk_result = getOperation().walk([&](TF::PartitionedCallOp call_op) { if (!call_op.getF().getRootReference().getValue().starts_with( kNestedModule)) @@ -67,7 +71,7 @@ void ExecutorTPUV1IslandInliningPass::runOnOperation() { auto called_func = dyn_cast_or_null(call_interface.resolveCallable()); - if (failed(inlineCall(inliner, call_interface, + if (failed(inlineCall(inliner, config.getCloneCallback(), call_interface, cast(called_func.getOperation()), called_func.getCallableRegion(), /* shouldCloneInlinedRegion = */ false))) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_island_coarsening.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_island_coarsening.cc index b75f081d1a00..81497dc53cba 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_island_coarsening.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_island_coarsening.cc @@ -16,11 +16,12 @@ limitations under the License. // This transformation pass takes TensorFlow executor dialect IslandOps and // merges the one that contains operation marked to run on TPU. -#include -#include +#include +#include #include #include #include +#include #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc index 06e274d65527..2746d0ddb406 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fold_broadcast.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fold_broadcast.cc index 9ef0b9b89c34..244cf3263d8d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fold_broadcast.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fold_broadcast.cc @@ -13,9 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include +#include #include +#include -#include "absl/memory/memory.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc index 9d6bd563845f..64d5cd314d41 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc @@ -209,7 +209,9 @@ void FreezeGlobalTensorsPass::runOnOperation() { it.first->eraseOperands(it.second); } - func.eraseArguments(args_to_erase); + if (failed(func.eraseArguments(args_to_erase))) { + return signalPassFailure(); + } } // Erase all global tensors that were frozen. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/freeze_saved_model_assets.cc b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_saved_model_assets.cc index daaf9df74004..25bc9067ecd8 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/freeze_saved_model_assets.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_saved_model_assets.cc @@ -13,9 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include #include -#include #include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -106,7 +105,10 @@ void FreezeAssetsPass::runOnOperation() { init_op.erase(); } } - func.eraseArguments(args_to_erase); + + if (failed(func.eraseArguments(args_to_erase))) { + return signalPassFailure(); + } } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc index 65d9a288d568..257eafcad556 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc @@ -16,6 +16,10 @@ limitations under the License. // This transformation pass transforms functional control flow operations in the // TensorFlow dialect to MLIR Control Flow Graph (CFG) form. +#include +#include +#include + #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc index 11be79869f4f..b368af8b3f77 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc @@ -17,6 +17,10 @@ limitations under the License. 
// TensorFlow dialect to their region based counterparts, i.e., // tf.If -> tf.IfRegion and tf.While -> tf.WhileRegion +#include +#include +#include + #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc index 2327bcb3e414..e73d76fbc590 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include +#include #include #include +#include +#include +#include #include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc b/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc index f943d0984617..c267b08a43e4 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include +#include + #include "llvm/ADT/STLExtras.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc index 07935b3cbbc6..3610747331a2 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc @@ -17,6 +17,8 @@ limitations under the License. #include +#include "absl/log/log.h" +#include "absl/status/status.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project @@ -25,6 +27,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/core/protobuf/config.pb.h" namespace mlir { namespace TF { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h index 0de93ca44646..30d1284557ab 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h @@ -18,7 +18,9 @@ limitations under the License. 
#include +#include "absl/status/status.h" #include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h" +#include "tensorflow/core/protobuf/config.pb.h" namespace mlir { namespace TF { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc b/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc index a23c09de0ce6..6e81e08eea27 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/group_by_dialect.cc b/tensorflow/compiler/mlir/tensorflow/transforms/group_by_dialect.cc index 2edd6d76f031..c00c32d10d1c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/group_by_dialect.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/group_by_dialect.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include #include #include #include diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/guarantee_all_funcs_one_use.cc b/tensorflow/compiler/mlir/tensorflow/transforms/guarantee_all_funcs_one_use.cc index ec048e1ef6e0..25ab9ba00dad 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/guarantee_all_funcs_one_use.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/guarantee_all_funcs_one_use.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" #include "mlir/Analysis/CallGraph.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/hoist_loop_invariant.cc b/tensorflow/compiler/mlir/tensorflow/transforms/hoist_loop_invariant.cc index 2acf81dbcd78..f78337a6fad2 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/hoist_loop_invariant.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/hoist_loop_invariant.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ #include +#include #include -#include #include "llvm/ADT/DenseSet.h" #include "llvm/Support/Casting.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/hoist_replicate_invariant_resource_writes.cc b/tensorflow/compiler/mlir/tensorflow/transforms/hoist_replicate_invariant_resource_writes.cc index 67c1d911889a..2c70a078fbb1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/hoist_replicate_invariant_resource_writes.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/hoist_replicate_invariant_resource_writes.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include + #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" @@ -49,9 +52,10 @@ struct HoistReplicateInvariantResourceWritesPass // TODO(prakalps): This is a common utility and other passes use something // similar. Move to common utils. bool IsResourceType(Type type) { - return type.isa() || - (type.isa() && - type.cast().getElementType().isa()); + return llvm::isa(type) || + (llvm::isa(type) && + llvm::isa( + llvm::cast(type).getElementType())); } SmallVector GetAccessedResources(Operation& op) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD index 907dcf9c23bd..be3bfa30afcf 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD @@ -6,8 +6,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ - "//learning/serving/contrib/tfrt/mlir/saved_model_analysis:__pkg__", - "//tensorflow/compiler/mlir:__pkg__", + "//tensorflow/compiler/mlir:__subpackages__", "//tensorflow/compiler/mlir/tensorflow/transforms:__pkg__", "//tensorflow/compiler/mlir/tf2xla/api:__subpackages__", "//tensorflow/compiler/mlir/tfrt:__subpackages__", @@ -142,15 +141,10 @@ tf_cc_test( gentbl_cc_library( name = "runtime_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=RuntimeLowering", - ], - "runtime_passes.h.inc", - ), - ], + tbl_outs = {"runtime_passes.h.inc": [ + "-gen-pass-decls", + "-name=RuntimeLowering", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "runtime_passes.td", deps = [ @@ -216,7 +210,6 @@ cc_library( "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", - "@llvm-project//llvm:ir_headers", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", @@ -271,6 +264,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "@com_google_absl//absl/log", + "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_merge_variables_with_execute.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_merge_variables_with_execute.cc index 16ae6c7a8f99..9492c007b07c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_merge_variables_with_execute.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_merge_variables_with_execute.cc @@ -13,11 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include #include #include #include #include "absl/log/log.h" +#include "absl/strings/str_join.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" @@ -37,6 +40,7 @@ limitations under the License. 
#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/DebugStringHelper.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" @@ -144,8 +148,8 @@ bool AddAccessedResourceIds( bool IsResourceMergeable(Attribute& resource_attr, Attribute& device_attr) { return resource_attr && ((resource_attr == device_attr) || - (resource_attr.cast().getValue().find( - "COMPOSITE") != llvm::StringRef::npos)); + (llvm::cast(resource_attr).getValue().find("COMPOSITE") != + llvm::StringRef::npos)); } // Finds the variable access info for a TPUExecute op. @@ -193,7 +197,7 @@ VariableAccessesForTPUExecute BuildVariableAccessInfo( // Check device matching for the node defining the resource. if (!IsResourceMergeable(resource_attr, device_attr)) continue; } else { - auto resource_arg = resource.dyn_cast(); + auto resource_arg = dyn_cast(resource); assert(resource_arg); if (resource_arg.getOwner() != &func.front()) continue; // Check device matching for the argument defining the resource. @@ -515,8 +519,8 @@ LogicalResult MergeForOneTPUExecute( // Check that all resources are either read or written to. for (auto it : llvm::enumerate(var_access_info.new_operand_values)) { Type type = it.value().getType(); - if (type.isa() && - type.cast().getElementType().isa()) { + if (isa(type) && + isa(cast(type).getElementType())) { if (!llvm::is_contained(device_var_reads_indices, it.index()) && !llvm::is_contained(device_var_updates_indices, it.index())) { return execute_launch.GetBody().front().emitError("operand #") diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_rewrite_pass.cc index d8067af3f295..780e4c222e56 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_rewrite_pass.cc @@ -19,13 +19,13 @@ limitations under the License. 
#include #include +#include "absl/log/log.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/IR/Attributes.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" @@ -192,6 +192,11 @@ Operation* BuildCompileOp( metadata.args(operand_and_idx.index()).shape()); if (shape.IsFullyDefined()) continue; + VLOG(1) << "Building compile op for module_name: " << module_name.str() + << " dynamic shape for operand index: " << operand_and_idx.index() + << " metadata: " + << metadata.args(operand_and_idx.index()).DebugString(); + auto shape_op = builder->create( cluster_func.getLoc(), tensorflow::GetTypeFromTFTensorShape({-1}, builder->getIntegerType(64)), @@ -311,8 +316,7 @@ LogicalResult AddToParallelExecuteOp( int num_results_pre_cluster, Operation* compile_op, tf_device::ClusterFuncOp cluster_func, OpBuilder* builder, tf_device::ParallelExecuteOp old_parallel_execute, - tf_device::ParallelExecuteOp* new_parallel_execute, - int* cluster_idx) { + tf_device::ParallelExecuteOp* new_parallel_execute, int* cluster_idx) { const int num_cores_per_replica = tpu_devices.front().size(); // parallel_execute op returns concatenated list of return values of // all its regions. @@ -386,7 +390,7 @@ LogicalResult AddToParallelExecuteOp( builder, block.getParent()->getLoc(), execute, device); builder->create(block.getParent()->getLoc(), - block_launch_op.getResults()); + block_launch_op.getResults()); } return success(); @@ -466,8 +470,7 @@ LogicalResult CheckParallelExecuteConstainsValidNonClusterProcess( return success(); } -int GetNumResultsPreCluster( - tf_device::ParallelExecuteOp parallel_execute) { +int GetNumResultsPreCluster(tf_device::ParallelExecuteOp parallel_execute) { int num_results_pre_cluster = 0; for (mlir::Region& region : parallel_execute.getRegions()) { if (llvm::isa(region.front().front())) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_globals_to_ml_program.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lower_globals_to_ml_program.cc index 8d58b8177b33..010bfd460afe 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_globals_to_ml_program.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_globals_to_ml_program.cc @@ -189,7 +189,9 @@ static LogicalResult convertTFGlobals(ModuleOp module) { argsToErase.set(i); } } - func.eraseArguments(argsToErase); + if (failed(func.eraseArguments(argsToErase))) { + return failure(); + } } // Erase all the global tensors. 
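Several passes touched above (deduplicate_bound_input_bindings, freeze_global_tensors, freeze_saved_model_assets, lower_globals_to_ml_program, optimize_global_tensors) stop ignoring the result of func.eraseArguments and instead fail the pass when the erasure does not succeed. A minimal sketch of that pattern, assuming it is called from a pass where signalPassFailure() is available; the helper name and the use_empty() filter are illustrative and not taken from the patch:

```cpp
// Sketch only: failure-checked argument erasure as adopted by the passes above.
#include "llvm/ADT/BitVector.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Support/LogicalResult.h"

// Assumed to be called from within a pass's runOnOperation().
static mlir::LogicalResult EraseUnusedArguments(mlir::func::FuncOp func) {
  llvm::BitVector args_to_erase(func.getNumArguments());
  for (unsigned i = 0, e = func.getNumArguments(); i < e; ++i) {
    if (func.getArgument(i).use_empty()) args_to_erase.set(i);
  }
  // In the MLIR revision this patch targets, eraseArguments reports whether the
  // signature rewrite succeeded; propagate that instead of assuming success.
  return func.eraseArguments(args_to_erase);
}
```

A caller would then mirror the patch: `if (failed(EraseUnusedArguments(func))) return signalPassFailure();`.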
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td index 4c7810f8df51..a9ff5a8f7626 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td @@ -34,7 +34,7 @@ class GetI64ScalarElementsAttr : class GetF32Scalar : NativeCodeCall<"GetF32Scalar(&$_builder, " # value # ")">; -def TrueBoolAttr : AttrConstraint().getValue()">>; +def TrueBoolAttr : AttrConstraint($_self).getValue()">>; def CreateTFShapeOp : NativeCodeCall< "$_builder.create($0.getLoc(), $1, $2)">; @@ -74,7 +74,7 @@ def LowerAddOp : Pat<(TF_AddOp TF_NumberNotQuantizedTensor:$x, def GetBiasAddGradReductionIndices : NativeCodeCall< "GetBiasAddGradReductionIndices(" - "$0.getType().cast().getRank(), $1, &$_builder)">; + "llvm::cast($0.getType()).getRank(), $1, &$_builder)">; def LowerBiasAddGradOp : Pat<(TF_BiasAddGradOp AnyRankedTensor:$out_backprop, $data_format), @@ -120,12 +120,12 @@ def LowerSoftmaxCrossEntropyWithLogitsOp : Pattern< // dimension should be known. class GetDimSizeOfType : NativeCodeCall< "GetScalarOfType(getElementTypeOrSelf($1), " - "$0.getType().cast().getDimSize(" # dim # "))">; + "llvm::cast($0.getType()).getDimSize(" # dim # "))">; // Same as the above with i32 element type. class GetDimSizeAsI32 : NativeCodeCall< "GetScalarOfType($_builder.getIntegerType(32), " - "$0.getType().cast().getDimSize(" # dim # "))">; + "llvm::cast($0.getType()).getDimSize(" # dim # "))">; // Sparse version of SoftmaxCrossEntropyWithLogits is lowered to dense by // expanding the sparse labels using: @@ -285,7 +285,7 @@ def LowerIsNanOp : Pat<(TF_IsNanOp $x), def GetAllAxes : NativeCodeCall< "GetI64ElementsAttrForSeq(" - "0, $0.getType().cast().getRank(), &$_builder)">; + "0, llvm::cast($0.getType()).getRank(), &$_builder)">; // L2Loss is lowered using the formula, // L2Loss(input) = Sum(input * input) / 2 diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc index be7e914bd298..f02dffc5d6f2 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc @@ -27,10 +27,10 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/verification_utils.h" +#include "tensorflow/compiler/mlir/utils/validators.h" // IWYU pragma: keep namespace mlir { namespace TF { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td index be01d2769020..9ad34d2064c7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td @@ -23,27 +23,27 @@ def IsDataFormatNHWC : ConstantAttr; // Get the last dimension size as a 1-d single element attr. 
def GetLastDimSizeAsI32 : NativeCodeCall< "DenseElementsAttr::get(RankedTensorType::get({1}, $_builder.getIntegerType(32)), " - "static_cast($0.getType().cast().getDimSize( " - " $0.getType().cast().getRank() - 1)))">; + "static_cast(llvm::cast($0.getType()).getDimSize( " + " llvm::cast($0.getType()).getRank() - 1)))">; // Check whether the tensor is ranked and whether its last dim is static. def IsRankedShapeLastDimStatic : Constraint()">, - CPred<"!$0.getType().cast().isDynamicDim( " - " $0.getType().cast().getRank() - 1)">]>>; + CPred<"llvm::isa($0.getType())">, + CPred<"!llvm::cast($0.getType()).isDynamicDim( " + " llvm::cast($0.getType()).getRank() - 1)">]>>; def IsNotComplexType : Constraint()">, - CPred<"!$0.getType().cast().getElementType().isa()"> + CPred<"llvm::isa($0.getType())">, + CPred<"!llvm::isa(llvm::cast($0.getType()).getElementType())"> ]>>; // Only fuse multiplier if all dimensions other than the channel dimension // are equal to 1. def CanFuseMulAndConv2D : - Constraint>; + Constraint>; def F32ElementsAttr : ElementsAttrBase< - CPred<"$_self.cast().getShapedType().getElementType().isF32()">, "float constant tensor">; + CPred<"llvm::cast($_self).getShapedType().getElementType().isF32()">, "float constant tensor">; def DefinedByConv2D : Constraint($0.getDefiningOp())">>; // Checks if the value has only one user. def HasOneUse : Constraint>; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc index fd4e631a4a7d..f69218a8bc72 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc @@ -137,7 +137,7 @@ void EraseUnusedGlobalTensors(ModuleOp module, } } -void EraseUnusedBoundInputs(ModuleOp module) { +LogicalResult EraseUnusedBoundInputs(ModuleOp module) { for (auto func : module.getOps()) { llvm::BitVector args_to_erase(func.getNumArguments()); for (int i = 0, e = func.getNumArguments(); i < e; i++) { @@ -146,8 +146,12 @@ void EraseUnusedBoundInputs(ModuleOp module) { args_to_erase.set(i); } } - func.eraseArguments(args_to_erase); + + if (failed(func.eraseArguments(args_to_erase))) { + return failure(); + } } + return success(); } void OptimizeGlobalTensorsPass::runOnOperation() { @@ -156,7 +160,9 @@ void OptimizeGlobalTensorsPass::runOnOperation() { return; } - EraseUnusedBoundInputs(module); + if (failed(EraseUnusedBoundInputs(module))) { + return signalPassFailure(); + } TF::ResourceAnalyzer resource_analyzer(module); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc b/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc index 46a9f020ed7d..8b5d2e0de1e2 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc @@ -69,19 +69,17 @@ class PrepareTpuComputationForTfExportPass class RewriteXlaHostComputeMlir : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(TF::_XlaHostComputeMlirOp op) const override { + LogicalResult matchAndRewrite(TF::_XlaHostComputeMlirOp op, + PatternRewriter& rewriter) const override { if (op.getManualSharding()) { // This rewrite does not support manual_sharding. 
It is expected that the // _XlaHostComputeMlirOp registered as an MlirXlaOpKernel will handle this // case later once the XlaBuilder graph reaches it. return failure(); } - return success(); - } - void rewrite(TF::_XlaHostComputeMlirOp op, - PatternRewriter& rewriter) const override { + llvm::SmallVector shape_attrs; shape_attrs.reserve(op.getNumResults()); for (Type ty : op.getResultTypes()) { @@ -141,6 +139,7 @@ class RewriteXlaHostComputeMlir op.getRecvKeyAttr(), /*cost_estimate_ns=*/rewriter.getI64IntegerAttr(kDefaultCostEstimate), /*tpu_core=*/rewriter.getI64IntegerAttr(0)); + return success(); } }; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/remove_unused_arguments.cc b/tensorflow/compiler/mlir/tensorflow/transforms/remove_unused_arguments.cc index 493725c6cdcb..ecdf19e65f0e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/remove_unused_arguments.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/remove_unused_arguments.cc @@ -204,8 +204,8 @@ void RemoveUnusedArgumentsPass::runOnOperation() { } EraseReturnOperands(region, unused_results); - func.eraseResults(unused_results); - func.eraseArguments(unused_args); + if (failed(func.eraseResults(unused_results))) return; + if (failed(func.eraseArguments(unused_args))) return; args_to_erase.insert(std::make_pair(op, unused_args)); results_to_erase.insert(std::make_pair(op, unused_results)); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc index 3928faaa2803..4b699773371e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc @@ -94,7 +94,8 @@ LogicalResult GetDeviceOrdinal(const std::optional& devices, << " to be present in 'tf.device.replicate' op"; } llvm::StringRef tpu_device = - tpu_replica.cast()[replica_id].cast().getValue(); + llvm::cast(llvm::cast(tpu_replica)[replica_id]) + .getValue(); return tensorflow::GetDeviceOrdinalFromDeviceString(op->getLoc(), tpu_device, &device_ordinal); } @@ -136,9 +137,9 @@ LogicalResult UpdateRegionReplicateVariantOps( // Map aliased devices to explicit devices based on replica. if (auto launch = dyn_cast(op)) if (auto device_by_replica = devices.value().get(launch.getDevice())) - launch->setAttr( - kDeviceAttr, - device_by_replica.cast()[replica_id].cast()); + launch->setAttr(kDeviceAttr, + llvm::cast(llvm::cast( + device_by_replica)[replica_id])); return WalkResult::advance(); }); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index c7ffc9c0dd46..5ab1ea1a0345 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -681,7 +681,7 @@ llvm::SmallDenseMap MergeArgResourceUseInfo( // removed). If remaining_resource_data_types is provided, it will store the // data types of the remaining resource arguments, where the indices are after // removing unused ones. 
-void RemoveUnusedResourceArgumentsAndForwardedRetvals( +LogicalResult RemoveUnusedResourceArgumentsAndForwardedRetvals( const llvm::SmallDenseMap& infos, func::FuncOp func_op, llvm::SmallVector* old_to_new_arg_indices = nullptr, @@ -722,10 +722,13 @@ void RemoveUnusedResourceArgumentsAndForwardedRetvals( } } } - func_op.eraseArguments(indices_to_erase); + if (failed(func_op.eraseArguments(indices_to_erase))) { + return failure(); + } func_op.setType( FunctionType::get(func_op.getContext(), new_types, llvm::to_vector<4>(return_op->getOperandTypes()))); + return success(); } // Lifts reads/writes of resource arguments from func_op and changes its @@ -848,10 +851,15 @@ LogicalResult HandleWhileLoop(TF::WhileOp while_op, func::FuncOp body, // Remove unused resources in functions. llvm::SmallVector old_to_new_indices; llvm::SmallDenseMap remaining_resource_data_types; - RemoveUnusedResourceArgumentsAndForwardedRetvals( - resource_arg_uses, body, &old_to_new_indices, - &remaining_resource_data_types); - RemoveUnusedResourceArgumentsAndForwardedRetvals(resource_arg_uses, cond); + if (failed(RemoveUnusedResourceArgumentsAndForwardedRetvals( + resource_arg_uses, body, &old_to_new_indices, + &remaining_resource_data_types))) { + return failure(); + } + if (failed(RemoveUnusedResourceArgumentsAndForwardedRetvals(resource_arg_uses, + cond))) { + return failure(); + } (void)LiftArgRetResourcesForFunction( body, remaining_resource_data_types, [&](int64_t index, Value value) { return_op->setOperand(index, value); }); @@ -916,11 +924,18 @@ LogicalResult HandleCaseOrIfOp(CaseOrIfOp op, ArrayRef branches) { if (resource_arg_uses.empty()) return success(); // Remove unused resources in functions. llvm::SmallDenseMap remaining_resource_data_types; - RemoveUnusedResourceArgumentsAndForwardedRetvals( - resource_arg_uses, branches.front(), /*old_to_new_arg_indices=*/nullptr, - &remaining_resource_data_types); - for (auto func : branches.drop_front()) - RemoveUnusedResourceArgumentsAndForwardedRetvals(resource_arg_uses, func); + if (failed(RemoveUnusedResourceArgumentsAndForwardedRetvals( + resource_arg_uses, branches.front(), + /*old_to_new_arg_indices=*/nullptr, + &remaining_resource_data_types))) { + return failure(); + } + for (auto func : branches.drop_front()) { + if (failed(RemoveUnusedResourceArgumentsAndForwardedRetvals( + resource_arg_uses, func))) { + return failure(); + } + } // Forward resource inputs updated in any branch to the outputs of both // branches. First prepare the mapping from arg to new update output. @@ -1055,9 +1070,11 @@ LogicalResult HandlePartitionedCallOpCallee( // Remove unused resources in functions. 
llvm::SmallDenseMap remaining_resource_data_types; - RemoveUnusedResourceArgumentsAndForwardedRetvals( - result->use_info, callee, /*old_to_new_arg_indices=*/nullptr, - &remaining_resource_data_types); + if (failed(RemoveUnusedResourceArgumentsAndForwardedRetvals( + result->use_info, callee, /*old_to_new_arg_indices=*/nullptr, + &remaining_resource_data_types))) { + return failure(); + } for (const auto& entry : remaining_resource_data_types) { result->arg_data_type_and_updated_output_index[entry.getFirst()] = { entry.getSecond(), -1}; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc index 303e5aa2b6dd..346f571bf5ea 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include "llvm/ADT/BitVector.h" +#include "llvm/Support/LogicalResult.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project @@ -172,8 +173,8 @@ func::FuncOp CloneFunctionIfNeeded(func::FuncOp func) { // branch functions to (a) drop the ununsed return values, and (b) as a result // if some argument becomes unused in all branches, drop that argument and the // corresponding if/case input operand. -void EliminateUnusedResultsForIfCase(Operation *op, - ArrayRef branches) { +LogicalResult EliminateUnusedResultsForIfCase(Operation *op, + ArrayRef branches) { // Clone branch functions if needed since we will be mutating them. SmallVector cloned_branches; cloned_branches.reserve(branches.size()); @@ -216,7 +217,11 @@ void EliminateUnusedResultsForIfCase(Operation *op, // Traverse arguments backward so that indices to be deleted stay unchanged. for (int idx = num_args - 1; idx >= 0; --idx) { if (used_args.test(idx)) continue; - for (func::FuncOp func : cloned_branches) func.eraseArgument(idx); + for (func::FuncOp func : cloned_branches) { + if (failed(func.eraseArgument(idx))) { + return failure(); + } + } // For if/case, arg #i of attached function corresponds to operand #i+1 op->eraseOperand(idx + 1); } @@ -231,10 +236,11 @@ void EliminateUnusedResultsForIfCase(Operation *op, } EliminateUnusedResults(op); + return success(); } // Eliminated unused results from a functional while. -void EliminateUnusedResultsForWhile(TF::WhileOp op) { +LogicalResult EliminateUnusedResultsForWhile(TF::WhileOp op) { func::FuncOp cond = op.cond_function(); func::FuncOp body = op.body_function(); @@ -254,7 +260,7 @@ void EliminateUnusedResultsForWhile(TF::WhileOp op) { } } - if (can_eliminate.empty()) return; + if (can_eliminate.empty()) return success(); func::FuncOp cloned_cond = CloneFunctionIfNeeded(cond); func::FuncOp cloned_body = CloneFunctionIfNeeded(body); @@ -268,9 +274,13 @@ void EliminateUnusedResultsForWhile(TF::WhileOp op) { // deleted stay unchanged. for (int idx = op.getNumResults() - 1; idx >= 0; --idx) { if (!can_eliminate.test(idx)) continue; - cloned_cond.eraseArgument(idx); + if (failed(cloned_cond.eraseArgument(idx))) { + return failure(); + } cloned_body.front().getTerminator()->eraseOperand(idx); - cloned_body.eraseArgument(idx); + if (failed(cloned_body.eraseArgument(idx))) { + return failure(); + } } // Patch up branch function types. 
@@ -280,6 +290,7 @@ void EliminateUnusedResultsForWhile(TF::WhileOp op) { func.front().getTerminator()->getOperandTypes())); } EliminateUnusedResults(op, &can_eliminate); + return success(); } // For resource results, replace all uses with the resource input to which the @@ -348,7 +359,9 @@ LogicalResult CanonicalizeFunctionalIfCase(Operation *op, if (!has_resource_result) return success(); // Drop unused results. - EliminateUnusedResultsForIfCase(op, branches); + if (failed(EliminateUnusedResultsForIfCase(op, branches))) { + return failure(); + } return success(); } @@ -368,7 +381,9 @@ LogicalResult CanonicalizeFunctionalWhile(TF::WhileOp op) { if (!has_resource_result) return success(); // Drop unused results. - EliminateUnusedResultsForWhile(op); + if (failed(EliminateUnusedResultsForWhile(op))) { + return failure(); + } return success(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 50f6cc54c4e1..106c65368a18 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -92,6 +92,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h" #include "xla/hlo/translate/hlo_to_mhlo/hlo_utils.h" #include "xla/hlo/translate/mhlo_to_hlo/type_to_shape.h" +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" #include "xla/service/shape_inference.h" #include "xla/shape.h" #include "xla/tsl/platform/errors.h" @@ -510,7 +511,7 @@ Type GetNewArgType(Type old_arg_type, ArrayRef shape, } new_arg_type = tensorflow::GetTypeFromTFTensorShape( new_shape, element_type, - mhlo::TypeExtensionsAttr::get(context, new_bounds)); + mlir::mhlo::TypeExtensionsAttr::get(context, new_bounds)); } } return new_arg_type; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/BUILD index d19d5e8e8ab5..60216929dd49 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/BUILD @@ -5,7 +5,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ - "//tensorflow/compiler/mlir:__pkg__", + "//tensorflow/compiler/mlir:__subpackages__", "//tensorflow/compiler/mlir/tensorflow/transforms:__pkg__", "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:__pkg__", "//tensorflow/compiler/mlir/tf2xla/internal:__pkg__", @@ -16,15 +16,10 @@ package( gentbl_cc_library( name = "sparsecore_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=SparseCore", - ], - "sparsecore_passes.h.inc", - ), - ], + tbl_outs = {"sparsecore_passes.h.inc": [ + "-gen-pass-decls", + "-name=SparseCore", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "sparsecore_passes.td", deps = [ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc index ccd246bd0d85..d22180fdbe45 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc @@ -148,6 +148,7 @@ return selected_results #include "mlir/Pass/Pass.h" // from @llvm-project 
#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/Inliner.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/jit/flags.h" @@ -422,6 +423,7 @@ struct Inliner : public InlinerInterface { LogicalResult InlineCallsInFunc(func::FuncOp func, bool inline_all_funcs = false) { llvm::SetVector ops_to_erase; + InlinerConfig config; for (auto caller : func.getRegion().getOps()) { if (!inline_all_funcs && @@ -441,7 +443,8 @@ struct Inliner : public InlinerInterface { auto callee = llvm::dyn_cast(symbol_table.lookup(caller.getF())); auto& src_region = callee.getRegion(); - auto result = inlineCall(*this, caller, callee, &src_region, true); + auto result = inlineCall(*this, config.getCloneCallback(), caller, callee, + &src_region, true); if (failed(result)) { func.emitError("Inliner failed"); return result; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc index 47b046d9fdae..7326c0bde120 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc @@ -88,7 +88,8 @@ LogicalResult GetSplitElementTypeAndCount(TF::TensorArraySplitV3Op split, if (!lengths_const) return split.emitOpError("non-constant split lengths"); *count = lengths_const.getValue().getNumElements(); if (*count <= 0) return split.emitOpError("non-positive split count"); - auto buffer_type = split.getValue().getType().dyn_cast(); + auto buffer_type = + llvm::dyn_cast(split.getValue().getType()); if (!buffer_type || !buffer_type.hasStaticShape() || buffer_type.getRank() < 1) { return split.emitOpError("unknown or invalid split tensor shape"); @@ -110,7 +111,7 @@ LogicalResult GetSplitElementTypeAndCount(TF::TensorArraySplitV3Op split, // Tries to infer the tensor array element shape. std::optional> GetTensorArrayElementShape( TF::TensorArrayV3Op ta, ModuleOp module) { - auto element_shape = ta.getElementShapeAttr().cast(); + auto element_shape = llvm::cast(ta.getElementShapeAttr()); if (element_shape.hasStaticShape()) { auto shape = element_shape.getShape(); // Convert int64 to int64_t. @@ -142,20 +143,22 @@ std::optional> GetTensorArrayElementShape( // TensorArrayScatter writes vector of tensors to TensorArray. We can // deduce the shape of TensorArray by dropping the 0th dim of // TensorArrayScatter `value`. - auto t = scatter.getValue().getType().dyn_cast(); + auto t = + llvm::dyn_cast(scatter.getValue().getType()); if (!t || t.getShape().empty()) return std::nullopt; return RankedTensorType::get(t.getShape().drop_front(), t.getElementType()); } else if (auto gather = llvm::dyn_cast(user)) { // Try to infer from result type of gather. - auto t = gather.getValue().getType().dyn_cast(); + auto t = + llvm::dyn_cast(gather.getValue().getType()); if (t && !t.getShape().empty()) return RankedTensorType::get(t.getShape().drop_front(), t.getElementType()); // Try to infer from `element_shape` attribute of gather. 
- auto element_shape = gather.getElementShapeAttr() - .dyn_cast_or_null(); + auto element_shape = llvm::dyn_cast_if_present( + gather.getElementShapeAttr()); if (element_shape && element_shape.hasStaticShape()) { return RankedTensorType::get(element_shape.getShape(), gather.getDtype()); @@ -211,7 +214,7 @@ LogicalResult HandleTensorArrayV3Op( } auto var_type = RankedTensorType::get( {}, TF::ResourceType::get( - ArrayRef{buffer.getType().cast()}, + ArrayRef{llvm::cast(buffer.getType())}, ta.getContext())); auto local_var = builder.create( ta.getLoc(), ArrayRef{var_type}, ArrayRef{}); @@ -270,7 +273,7 @@ LogicalResult HandleTensorArrayWriteV3Op( cutil::GetElement(index_reshape, buffer, builder, write.getLoc(), /*keep_slice_shape=*/true); // Add a size-1 leading dimension to elem. - auto slice_type = original_elem.getType().cast(); + auto slice_type = llvm::cast(original_elem.getType()); elem = builder.create( write.getLoc(), ArrayRef{slice_type}, ArrayRef{elem, cutil::GetR1Const(slice_type.getShape(), builder, @@ -295,7 +298,7 @@ LogicalResult HandleTensorArrayConcatV3Op( } OpBuilder builder(concat); auto buffer = cutil::ReadLocalVariable(local_var, builder, concat.getLoc()); - auto buffer_type = buffer.getType().cast(); + auto buffer_type = llvm::cast(buffer.getType()); if (buffer_type.getShape().size() <= 1) { return concat.emitOpError("cannot concat on scalar-element tensor array"); } @@ -369,10 +372,9 @@ LogicalResult HandleTensorArraySizeV3Op( if (stats.count(local_var) == 0) { return size.emitOpError("unknown tensor array"); } - auto buffer_type = getElementTypeOrSelf(local_var.getType()) - .cast() - .getSubtypes()[0] - .cast(); + auto buffer_type = llvm::cast( + llvm::cast(getElementTypeOrSelf(local_var.getType())) + .getSubtypes()[0]); OpBuilder builder(size); auto result = cutil::CreateScalarConst(buffer_type.getDimSize(0), builder, size.getLoc()); @@ -387,10 +389,9 @@ LogicalResult CreateAndInitializeGradVariable(Type local_var_type, *var = builder.create( op->getLoc(), ArrayRef{local_var_type}, ArrayRef{}); Value buffer; - auto buffer_type = getElementTypeOrSelf(local_var_type) - .cast() - .getSubtypes()[0] - .cast(); + auto buffer_type = llvm::cast( + llvm::cast(getElementTypeOrSelf(local_var_type)) + .getSubtypes()[0]); if (failed(cutil::CreateInitBufferValue( buffer_type.getShape().drop_front(), buffer_type.getDimSize(0), op, buffer_type.getElementType(), builder, &buffer))) { @@ -478,7 +479,7 @@ llvm::SmallDenseMap> AccessedGradients( llvm::SmallDenseMap> result; llvm::SmallDenseMap> result_sets; auto insert = [&](Value v, const string& source, const Block& func_block) { - auto arg = v.dyn_cast(); + auto arg = dyn_cast(v); if (!arg || arg.getOwner() != &func_block) return; auto insert_res = result_sets[arg.getArgNumber()].insert(source); if (!insert_res.second) return; @@ -594,7 +595,7 @@ LogicalResult HandleWhileOp(TF::WhileOp while_op, ModuleOp module, for (int64_t i = 0; i < while_op.getNumResults(); ++i) { if (!ta_arg_buffer_type(i)) continue; auto retval = old_body_ret->getOperand(i); - auto arg = retval.dyn_cast(); + auto arg = dyn_cast(retval); if (!arg) { return while_op.emitOpError( "output tensor array does not alias input in a while loop"); @@ -702,13 +703,13 @@ LogicalResult HandleIfOp(TF::IfOp if_op, ModuleOp module, if_op->getAttrs()); auto ret_forwards_input = [](func::FuncOp f, int64_t ret_ind) -> int64_t { auto retval = f.front().getTerminator()->getOperand(ret_ind); - auto arg = retval.dyn_cast(); + auto arg = dyn_cast(retval); if (!arg) return -1; return 
arg.getArgNumber(); }; for (int64_t i = 0; i < if_op.getNumResults(); ++i) { - if (!getElementTypeOrSelf(if_op.getResult(i).getType()) - .isa()) { + if (!isa( + getElementTypeOrSelf(if_op.getResult(i).getType()))) { if_op.getResult(i).replaceAllUsesWith(new_if.getResult(i)); continue; } @@ -811,8 +812,8 @@ LogicalResult HandlePartitionedCallOp( } for (int64_t i = 0; i < call.getNumResults(); ++i) { auto ret = lowered_callee.front().getTerminator()->getOperand(i); - if (!getElementTypeOrSelf(ret.getType()).isa()) continue; - auto arg = ret.dyn_cast(); + if (!isa(getElementTypeOrSelf(ret.getType()))) continue; + auto arg = dyn_cast(ret); if (!arg) continue; info.ret_forward_input.emplace_back(i, arg.getArgNumber()); } @@ -842,7 +843,7 @@ LogicalResult HandleRegionControlFlowOps( llvm::StringMap* decomposed_partitioned_call_callees) { for (OpOperand& operand : op.getOpOperands()) { - if (getElementTypeOrSelf(operand.get().getType()).isa()) { + if (isa(getElementTypeOrSelf(operand.get().getType()))) { return op.emitOpError() << "found unexpected type " << operand.get().getType() << " of operand #" << operand.getOperandNumber() @@ -851,7 +852,7 @@ LogicalResult HandleRegionControlFlowOps( } } for (OpResult result : op.getResults()) { - if (getElementTypeOrSelf(result.getType()).isa()) { + if (isa(getElementTypeOrSelf(result.getType()))) { return op.emitOpError() << "found unexpected type " << result.getType() << " of result #" << result.getResultNumber() diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.cc index 40d9032b499f..9feb3a8bab17 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.cc @@ -25,6 +25,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/LogicalResult.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -125,7 +126,7 @@ class AssetSinkingPass : public impl::AssetSinkingPassBase { } // Erase function arguments with bounded input. - func.eraseArguments(arg_indexes_to_remove); + CHECK(llvm::succeeded(func.eraseArguments(arg_indexes_to_remove))); } }; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_utils.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_utils.cc index 0f77375840ba..79a0e60d2e61 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_utils.cc @@ -23,6 +23,7 @@ limitations under the License. #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/LogicalResult.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -68,12 +69,14 @@ void UpdateTerminatorArguments(T& func, // Erases function arguments indexed at `args_to_erase`. Also applies the // changes to any relevant function attributes accordingly. 
-void EraseFuncOpArguments(func::FuncOp func_op, - const ArrayRef args_to_erase) { +LogicalResult EraseFuncOpArguments(func::FuncOp func_op, + const ArrayRef args_to_erase) { BitVector args_to_erase_bit_vector(func_op.getNumArguments()); for (const unsigned i : args_to_erase) args_to_erase_bit_vector.set(i); - func_op.eraseArguments(args_to_erase_bit_vector); + if (failed(func_op.eraseArguments(args_to_erase_bit_vector))) { + return failure(); + } // Erases entries in "tf._input_shapes" attribute of `func_op` that correspond // to the erased arguments. @@ -93,6 +96,7 @@ void EraseFuncOpArguments(func::FuncOp func_op, kTfInputShapesAttr, ArrayAttr::get(func_op.getContext(), updated_input_shapes_attr)); } + return success(); } // Updates 'while_op' signatures based on which arguments should be removed @@ -236,9 +240,13 @@ LogicalResult EraseObsoleteResourceUses( // 3) Update function result to match the terminator. llvm::BitVector result_indices_to_erase; UpdateTerminatorArguments(func, args_to_erase, result_indices_to_erase); - EraseFuncOpArguments(func, args_to_erase); + if (failed(EraseFuncOpArguments(func, args_to_erase))) { + return failure(); + } - func.eraseResults(result_indices_to_erase); + if (failed(func.eraseResults(result_indices_to_erase))) { + return failure(); + } } else if (auto read_var = dyn_cast(user_op)) { // Read variables was already replaced by constant op. Just remove the op. read_var->erase(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_partitioning.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_partitioning.cc index fdacf313d302..18344894ff4c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_partitioning.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_partitioning.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h" @@ -49,19 +50,18 @@ struct TPUResourceReadsWritesPartitioningPass bool AllResourceTypesHaveSubtypes(TypeRange resources) { for (Type resource : resources) - if (!llvm::hasSingleElement(resource.cast() - .getElementType() - .cast() - .getSubtypes())) + if (!llvm::hasSingleElement( + llvm::cast( + llvm::cast(resource).getElementType()) + .getSubtypes())) return false; return true; } Type GetResourceSubtype(Type type) { - return type.cast() - .getElementType() - .cast() + return llvm::cast( + llvm::cast(type).getElementType()) .getSubtypes() .front(); } @@ -118,7 +118,7 @@ mlir::Attribute GetDeviceOfResource(mlir::func::FuncOp func, if (auto* resource_op = resource.getDefiningOp()) { return resource_op->getAttr(kDeviceAttr); } else { - const auto resource_arg = resource.dyn_cast_or_null(); + const auto resource_arg = dyn_cast_or_null(resource); if (resource_arg && (resource_arg.getOwner() == &(func.front()))) { return func.getArgAttrOfType( resource_arg.getArgNumber(), kFuncDeviceAttr); @@ -129,7 +129,7 @@ mlir::Attribute GetDeviceOfResource(mlir::func::FuncOp func, } bool IsCompositeDevice(mlir::Attribute attr) { - const auto str_attr = attr.dyn_cast_or_null(); + const auto str_attr = llvm::dyn_cast_if_present(attr); return str_attr && (str_attr.getValue().find("COMPOSITE") != llvm::StringRef::npos); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/verify_no_outside_compilation_markers_pass_test.cc b/tensorflow/compiler/mlir/tensorflow/transforms/verify_no_outside_compilation_markers_pass_test.cc index f042737065ac..4b26fd79dfbb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/verify_no_outside_compilation_markers_pass_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/verify_no_outside_compilation_markers_pass_test.cc @@ -31,7 +31,7 @@ namespace TFDevice { using ::mlir::MLIRContext; using ::mlir::ModuleOp; using ::mlir::OwningOpRef; -using ::mlir::mhlo::test::GetMlirModuleFromString; +using ::mlir::hlo::test::GetMlirModuleFromString; class VerifyNoOutsideCompilationMarkersPassTest : public ::testing::Test { protected: diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h b/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h index 7d176d9692cd..1119d4e2b33c 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h @@ -20,6 +20,8 @@ limitations under the License. 
#include #include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringMap.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc index a9eb45e5da3c..bf786ac1a06c 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc @@ -180,10 +180,10 @@ absl::Status ConvertScalarTypeToDataType(Type type, DataType* dtype) { absl::StrCat("Converting ", debugString(type), " to DataType")); } -#define HANDLE_TF_TYPE(tftype, enumerant, name) \ - if (type.isa()) { \ - *dtype = DT_##enumerant; \ - return OkStatus(); \ +#define HANDLE_TF_TYPE(tftype, enumerant, name) \ + if (llvm::isa(type)) { \ + *dtype = DT_##enumerant; \ + return OkStatus(); \ } // NOLINTNEXTLINE #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc index 2efd63b29b04..aa818d2ae73b 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc @@ -126,7 +126,6 @@ TEST(DumpCrashReproducerTest, RoundtripDumpAndReadValid) { registry, mlir::MlirOptMainConfig{} .splitInputFile("") - .verifyDiagnostics(false) .verifyPasses(false) .allowUnregisteredDialects(false) .setPassPipelineParser(passPipeline)) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.cc index 9d4305b8e033..56dcee543015 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "llvm/Support/Casting.h" #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project @@ -41,11 +42,8 @@ void MarkResourceAsReadAndWrite( OpOperand& op_operand, SmallVectorImpl>& effects) { - if (op_operand.get() - .getType() - .cast() - .getElementType() - .isa()) { + if (llvm::isa(llvm::cast(op_operand.get().getType()) + .getElementType())) { effects.emplace_back(MemoryEffects::Read::get(), &op_operand, ResourceEffects::Variable::get()); effects.emplace_back(MemoryEffects::Write::get(), &op_operand, @@ -57,11 +55,8 @@ void MarkResourceAsReadOnly( OpOperand& op_operand, SmallVectorImpl>& effects) { - if (op_operand.get() - .getType() - .cast() - .getElementType() - .isa()) { + if (llvm::isa(llvm::cast(op_operand.get().getType()) + .getElementType())) { effects.emplace_back(MemoryEffects::Read::get(), &op_operand, ResourceEffects::Variable::get()); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc index 348ae41e3d2e..34917780dc80 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc @@ -48,13 +48,13 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Tools/mlir-translate/Translation.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo -#include "tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tools/parsers.h" #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" #include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h" +#include "tensorflow/compiler/mlir/tools/tf_mlir_translate_cl.h" #include "tensorflow/compiler/mlir/utils/string_container_utils.h" #include "tensorflow/compiler/tf2xla/layout_util.h" #include "tensorflow/compiler/tf2xla/xla_argument.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc index ac8ecf1090b2..b87afe634125 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc @@ -380,7 +380,7 @@ mlir::LogicalResult HandleTileShardedInputsUsingXlaSplitOps( std::vector paddings; paddings.reserve(rank); auto shape = llvm::to_vector<4>( - original_source.getType().cast().getShape()); + mlir::cast(original_source.getType()).getShape()); for (int dim = 0; dim < rank; ++dim) { paddings.push_back( GetPadding(dim, input_sharding.tile_assignment_dimensions(dim), diff --git a/tensorflow/compiler/mlir/tensorflow_to_stablehlo/python/BUILD b/tensorflow/compiler/mlir/tensorflow_to_stablehlo/python/BUILD index f7ec0f891812..236f761a868e 100644 --- a/tensorflow/compiler/mlir/tensorflow_to_stablehlo/python/BUILD +++ b/tensorflow/compiler/mlir/tensorflow_to_stablehlo/python/BUILD @@ -81,7 +81,6 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow_to_stablehlo:tf_to_stablehlo", "//tensorflow/core:lib", - "//third_party/python_runtime:headers", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", @@ -91,6 +90,7 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Support", + "@local_xla//third_party/python_runtime:headers", ], ) @@ -100,8 +100,8 @@ tf_python_pybind_extension( pytype_srcs = ["pywrap_tensorflow_to_stablehlo.pyi"], # Each dependency MUST be either header-only or exclusive. 
     deps = [
-        "//third_party/python_runtime:headers",
         "@com_google_absl//absl/strings:string_view",
+        "@local_xla//third_party/python_runtime:headers",
         "@pybind11",
         "@pybind11_abseil//pybind11_abseil:absl_casters",
        "@pybind11_abseil//pybind11_abseil:status_casters",
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
index bccea8e9d092..5a95a826f3c3 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
@@ -68,16 +68,17 @@ cc_library(
         "@local_xla//xla:xla_data_proto_cc",
         "@local_xla//xla/hlo/builder:xla_computation",
         "@local_xla//xla/hlo/ir:hlo",
+        "@local_xla//xla/hlo/translate:stablehlo",
         "@local_xla//xla/hlo/translate/mhlo_to_hlo:layout_util",
         "@local_xla//xla/hlo/translate/mhlo_to_hlo:mlir_hlo_to_hlo",
         "@local_xla//xla/hlo/translate/mhlo_to_hlo:type_to_shape",
         "@local_xla//xla/mlir_hlo",
-        "@local_xla//xla/mlir_hlo:hlo_dialect_registration",
         "@local_xla//xla/mlir_hlo:mhlo_passes",
+        "@local_xla//xla/mlir_hlo:stablehlo_extension_passes",
        "@local_xla//xla/service:hlo_proto_cc",
        "@local_xla//xla/tsl/platform:errors",
        "@local_xla//xla/tsl/platform:statusor",
-        "@stablehlo//:register",
+        "@stablehlo//:base",
     ],
 )
 
@@ -136,6 +137,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc",
         "//tensorflow/core/tpu:tpu_compile",
         "//tensorflow/core/tpu/kernels:tpu_compile_op_support",
         "//tensorflow/core/tpu/kernels:tpu_compile_proto_cc",
@@ -156,6 +158,7 @@ cc_library(
         "@local_xla//xla/hlo/ir:hlo",
         "@local_xla//xla/mlir_hlo:hlo_dialect_registration",
         "@local_xla//xla/pjrt:compile_options_proto_cc",
+        "@local_xla//xla/service:hlo_proto_cc",
     ],
 )
 
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc
index 6281ea68e378..d618a4446347 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc
@@ -54,7 +54,7 @@ limitations under the License.
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/Passes.h"  // from @llvm-project
-#include "stablehlo/dialect/Register.h"  // from @stablehlo
+#include "stablehlo/dialect/Base.h"  // from @stablehlo
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
@@ -87,9 +87,10 @@ limitations under the License.
#include "xla/hlo/translate/mhlo_to_hlo/layout_util.h" #include "xla/hlo/translate/mhlo_to_hlo/mlir_hlo_to_hlo.h" #include "xla/hlo/translate/mhlo_to_hlo/type_to_shape.h" +#include "xla/hlo/translate/stablehlo.h" #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" -#include "xla/mlir_hlo/mhlo/IR/register.h" #include "xla/mlir_hlo/mhlo/transforms/passes.h" +#include "xla/mlir_hlo/stablehlo_ext/transforms/passes.h" #include "xla/service/hlo.pb.h" #include "xla/shape.h" #include "xla/tsl/platform/errors.h" @@ -201,12 +202,12 @@ mlir::RankedTensorType GetBufferType(mlir::Type ty) { int64_t rank = ranked_ty.getRank(); llvm::SmallVector dims = llvm::to_vector<4>(ranked_ty.getShape()); - auto encoding = mlir::dyn_cast_or_null( - ranked_ty.getEncoding()); - if (encoding && !encoding.getBounds().empty()) { + llvm::ArrayRef bounds = + mlir::hlo::encodingToBounds(ranked_ty.getEncoding()); + if (!bounds.empty()) { for (int64_t dim = 0; dim < rank; ++dim) { if (dims[dim] == mlir::ShapedType::kDynamic) { - dims[dim] = encoding.getBounds()[dim]; + dims[dim] = bounds[dim]; } } } @@ -346,8 +347,7 @@ void GetInputMappingForMlir(int num_inputs, std::vector* input_mapping) { static void RegisterDialects(mlir::DialectRegistry& registry) { mlir::RegisterAllTensorFlowDialects(registry); - mlir::mhlo::registerAllMhloDialects(registry); - mlir::stablehlo::registerAllDialects(registry); + xla::RegisterMlirToHloDependentDialects(registry); } // Checks if functions can be inlined after TF -> HLO legalization. Currently @@ -581,7 +581,7 @@ void CreateConvertMlirToXlaHloPipeline( // Everything should be MHLO after this. if (!allow_partial_conversion) { pm.addNestedPass( - mlir::mhlo::CreateVerifyTFXLALegalizationPass(legalize_chlo)); + mlir::hlo::CreateVerifyTFXLALegalizationPass(legalize_chlo)); } } @@ -592,7 +592,7 @@ void CreateConvertMlirToXlaHloPipeline( // In order to export to XLA, we must sink constants to control flow regions, // since XLA uses functional control flow. pm.addNestedPass( - mlir::mhlo::createSinkConstantsToControlFlowPass()); + mlir::stablehlo_ext::createSinkConstantsToControlFlowPass()); } absl::Status RefineShapes(llvm::ArrayRef arg_shapes, @@ -988,7 +988,9 @@ static absl::StatusOr> RewriteWithArgs( main_fn.getFunctionType().getResults())); } - for (int idx : llvm::reverse(args_to_erase)) main_fn.eraseArgument(idx); + for (int idx : llvm::reverse(args_to_erase)) { + CHECK(llvm::succeeded(main_fn.eraseArgument(idx))); + } return params; } diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc index 7d3d77f3e290..1dda7d2981a4 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.h" +#include #include #include #include @@ -50,13 +51,16 @@ limitations under the License. 
#include "xla/client/compile_only_client.h" #include "xla/hlo/ir/hlo_input_output_alias_config.h" #include "xla/mlir_hlo/mhlo/IR/register.h" +#include "xla/service/hlo.pb.h" #include "xla/shape.h" #include "xla/shape_util.h" #include "xla/status_macros.h" #include "xla/tsl/framework/device_type.h" #include "xla/tsl/lib/monitoring/sampler.h" #include "xla/tsl/platform/errors.h" +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" @@ -65,6 +69,7 @@ limitations under the License. #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/profile_utils/cpu_utils.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" #include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" #include "tensorflow/core/tpu/tpu_compile.h" #include "tensorflow/core/util/debug_data_dumper.h" diff --git a/tensorflow/compiler/mlir/tf2xla/internal/inference/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/inference/BUILD index e80d33abb5cb..d87efdfbf146 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/inference/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/internal/inference/BUILD @@ -13,15 +13,10 @@ package( gentbl_cc_library( name = "inference_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=TF2XLA", - ], - "inference_passes.h.inc", - ), - ], + tbl_outs = {"inference_passes.h.inc": [ + "-gen-pass-decls", + "-name=TF2XLA", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "inference_passes.td", deps = [ diff --git a/tensorflow/compiler/mlir/tf2xla/internal/inference/inference_metrics_pass_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/inference/inference_metrics_pass_test.cc index 4567b8f4268c..c0565f78e226 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/inference/inference_metrics_pass_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/inference/inference_metrics_pass_test.cc @@ -34,7 +34,7 @@ namespace { using ::mlir::MLIRContext; using ::mlir::ModuleOp; using ::mlir::OwningOpRef; -using ::mlir::mhlo::test::GetMlirModuleFromString; +using ::mlir::hlo::test::GetMlirModuleFromString; using ::tensorflow::monitoring::testing::CellReader; static constexpr char kHasTpuPartitionedCallStreamzName[] = diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD index becdc528044f..9f2822c65f4f 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD @@ -7,8 +7,8 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ - "//learning/pathways/serving/transforms:__pkg__", - "//tensorflow/compiler/mlir:__pkg__", + "//learning/brain/tfrt/tpu/compiler/mlir:__pkg__", + "//tensorflow/compiler/mlir:__subpackages__", "//tensorflow/compiler/mlir/tf2xla/api:__subpackages__", "//tensorflow/compiler/mlir/tf2xla/internal:__subpackages__", ], @@ -71,15 +71,10 @@ cc_library( gentbl_cc_library( name = "clustering_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=TFXLABridgeClustering", - ], - "clustering_passes.h.inc", 
- ), - ], + tbl_outs = {"clustering_passes.h.inc": [ + "-gen-pass-decls", + "-name=TFXLABridgeClustering", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "clustering_passes.td", deps = [ @@ -229,15 +224,10 @@ cc_library( gentbl_cc_library( name = "mlir_to_graph_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=TFXLABridgeMlirToGraph", - ], - "mlir_to_graph_passes.h.inc", - ), - ], + tbl_outs = {"mlir_to_graph_passes.h.inc": [ + "-gen-pass-decls", + "-name=TFXLABridgeMlirToGraph", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "mlir_to_graph_passes.td", deps = [ @@ -459,15 +449,10 @@ cc_library( gentbl_cc_library( name = "lowering_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=TFXLABridgeLowering", - ], - "lowering_passes.h.inc", - ), - ], + tbl_outs = {"lowering_passes.h.inc": [ + "-gen-pass-decls", + "-name=TFXLABridgeLowering", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "lowering_passes.td", deps = [ @@ -570,7 +555,7 @@ cc_library( "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", "@local_xla//xla:xla_data_proto_cc", - "@local_xla//xla/client:sharding_builder", + "@local_xla//xla/hlo/builder:sharding_builder", ], ) diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/input_lowering_metrics_pass.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/input_lowering_metrics_pass.cc index d6c92101bf60..e5cfe09aaa57 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/input_lowering_metrics_pass.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/input_lowering_metrics_pass.cc @@ -61,7 +61,7 @@ void InputMetricsLoweringPass::runOnOperation() { auto abstractOp = op->getRegisteredInfo(); if (!abstractOp) return WalkResult::advance(); - if (mlir::mhlo::IsDynamicPadderOp(abstractOp->getTypeID())) { + if (mlir::hlo::IsDynamicPadderOp(abstractOp->getTypeID())) { has_dynamic_op = true; dynamism_op_counter->GetCell(op->getName().getStringRef().str()) ->IncrementBy(1); diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/input_lowering_metrics_pass_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/input_lowering_metrics_pass_test.cc index c26822fad303..39ddd64fa28d 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/input_lowering_metrics_pass_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/input_lowering_metrics_pass_test.cc @@ -37,7 +37,7 @@ namespace { using ::mlir::LogicalResult; using ::mlir::ModuleOp; -using ::mlir::mhlo::test::GetMlirModuleFromString; +using ::mlir::hlo::test::GetMlirModuleFromString; using ::tensorflow::monitoring::testing::CellReader; constexpr char kNotDynamicFunctionName[] = "kNotDynamicFunction"; diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/mark_ops_for_outside_compilation.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/mark_ops_for_outside_compilation.cc index 7308669b6359..70f43ed67dc6 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/mark_ops_for_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/mark_ops_for_outside_compilation.cc @@ -287,7 +287,7 @@ bool IsSupportedOp(Operation& op, auto abstractOp = op.getRegisteredInfo(); if (!abstractOp) return false; - return mlir::mhlo::HasTf2XlaFallback(abstractOp->getTypeID()); + return mlir::hlo::HasTf2XlaFallback(abstractOp->getTypeID()); } bool IsVariant(Value value) { @@ -465,7 +465,7 @@ void 
MarkOpsForOutsideCompilation::runOnOperation() { return signalPassFailure(); } RewritePatternSet patterns(&getContext()); - mlir::mhlo::PopulateLegalizeTfPatterns(module.getContext(), &patterns); + mlir::hlo::PopulateLegalizeTfPatterns(module.getContext(), &patterns); mlir::TF::PopulateTFLoweringBeforeHLOPatterns(module.getContext(), &patterns); mlir::TF::PopulateLoweringQuantizedPatterns(module.getContext(), &patterns); AddCanonicalizationPatterns(module.getContext(), &patterns); diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation.cc index db39ca12d9ce..0da0cc4fc4dd 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation.cc @@ -872,7 +872,7 @@ LogicalResult FormClustersInBlock( block, cluster_ops, results, cluster_successor_ops.getArrayRef()); auto num_replicas = cluster_metadata->getSecond().get(kNumReplicasAttr); - if (!num_replicas || !num_replicas.isa()) + if (!num_replicas || !mlir::isa(num_replicas)) return cluster.emitError() << "requires '" << kNumReplicasAttr << "' int attribute"; @@ -881,9 +881,9 @@ LogicalResult FormClustersInBlock( cluster_metadata->getSecond().get(kNumCoresPerReplicaAttr)); if (num_cores_per_replica_attr) num_cores_per_replica = num_cores_per_replica_attr.getInt(); - if (failed(ReplicateCluster(cluster, - num_replicas.cast().getInt(), - num_cores_per_replica))) + if (failed(ReplicateCluster( + cluster, mlir::cast(num_replicas).getInt(), + num_cores_per_replica))) return mlir::failure(); // Copy TPUReplicateMetadata attributes to `tf_device.cluster`. diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_validate_inputs_utils_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_validate_inputs_utils_test.cc index a64f06b838f1..a56d6304a7ef 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_validate_inputs_utils_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_validate_inputs_utils_test.cc @@ -31,7 +31,7 @@ namespace tf2xla { namespace internal { namespace { -using mlir::mhlo::test::GetMlirModuleFromString; +using mlir::hlo::test::GetMlirModuleFromString; TEST(IsPotentialUnsupportedOp, ClusterOpReturnsFalse) { mlir::MLIRContext context; diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.cc index e1048f8ea2ca..fcddd1058729 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.cc @@ -32,7 +32,7 @@ namespace internal { namespace { -using mlir::mhlo::test::GetMlirModuleFromString; +using mlir::hlo::test::GetMlirModuleFromString; class VerifyClusteringPassTest : public testing::Test { protected: diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/xla_broadcast.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/xla_broadcast.cc index b9efb7097d62..2e1933a08965 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/xla_broadcast.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/xla_broadcast.cc @@ -58,7 +58,6 @@ namespace internal { namespace { using llvm::dyn_cast; -using mlir::Attribute; using mlir::Block; using mlir::BlockArgument; using mlir::DenseIntElementsAttr; @@ -78,7 +77,6 @@ using mlir::ValueRange; using mlir::WalkResult; using 
mlir::func::FuncOp; using mlir::TF::ConstOp; -using mlir::TF::FillOp; using mlir::TF::IdentityOp; using mlir::TF::ShapeAttr; using mlir::TF::TPUDummyInputOp; diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir index 92754a181e85..6188395f648b 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir @@ -559,7 +559,7 @@ func.func @diag_part(%arg0: tensor<4x3x4x3xf32>) -> tensor<4x3xf32> { // CHECK: %[[RS:.*]] = mhlo.reshape %[[ARG]] : (tensor<4x3x4x3xf32>) -> tensor<12x12xf32> // CHECK-DAG: %[[IOTA0:.*]] = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<12x12xi32> // CHECK-DAG: %[[IOTA1:.*]] = "mhlo.iota"() <{iota_dimension = 1 : i64}> : () -> tensor<12x12xi32> - // CHECK-DAG: %[[COMP:.*]] = mhlo.compare EQ, %[[IOTA0]], %[[IOTA1]], NOTYPE : (tensor<12x12xi32>, tensor<12x12xi32>) -> tensor<12x12xi1> + // CHECK-DAG: %[[COMP:.*]] = mhlo.compare EQ, %[[IOTA0]], %[[IOTA1]] : (tensor<12x12xi32>, tensor<12x12xi32>) -> tensor<12x12xi1> // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[ZERO_MAT:.*]] = "mhlo.broadcast"(%[[ZERO]]) <{broadcast_sizes = dense<12> : tensor<2xi64>}> : (tensor) -> tensor<12x12xf32> // CHECK-DAG: %[[SEL:.*]] = mhlo.select %[[COMP]], %[[RS]], %[[ZERO_MAT]] : tensor<12x12xi1>, tensor<12x12xf32> @@ -622,7 +622,7 @@ func.func @matrix_diag_part(%arg0: tensor<7x140x128xi32>) -> tensor<7x22x128xi32 // CHECK-DAG: %[[V40:.*]] = mhlo.and %[[V36]], %[[V39]] : tensor<1x22x128xi1> // CHECK-DAG: %[[V41:.*]] = mhlo.reshape %[[V40]] : (tensor<1x22x128xi1>) -> tensor<22x128xi1> // CHECK-DAG: %[[V42:.*]] = "mhlo.concatenate"(%[[V33]], %[[V32]]) <{dimension = 0 : i64}> : (tensor<1x22x128xi32>, tensor<1x22x128xi32>) -> tensor<2x22x128xi32> - // CHECK-DAG: %[[V43:.*]] = "mhlo.gather"(%[[ARG]], %[[V42]]) <{dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[7, 1, 1]> : tensor<3xi64>}> : (tensor<7x140x128xi32>, tensor<2x22x128xi32>) -> tensor<7x22x128xi32> + // CHECK-DAG: %[[V43:.*]] = "mhlo.gather"(%[[ARG]], %[[V42]]) <{dimension_numbers = #mhlo.gather, slice_sizes = dense<[7, 1, 1]> : tensor<3xi64>}> : (tensor<7x140x128xi32>, tensor<2x22x128xi32>) -> tensor<7x22x128xi32> // CHECK-DAG: %[[V44:.*]] = "mhlo.broadcast"(%[[V41]]) <{broadcast_sizes = dense<7> : tensor<1xi64>}> : (tensor<22x128xi1>) -> tensor<7x22x128xi1> // CHECK-DAG: %[[V45:.*]] = "mhlo.broadcast"(%[[V0]]) <{broadcast_sizes = dense<[7, 22, 128]> : tensor<3xi64>}> : (tensor) -> tensor<7x22x128xi32> // CHECK: %[[V46:.*]] = mhlo.select %[[V44]], %[[V43]], %[[V45]] : tensor<7x22x128xi1>, tensor<7x22x128xi32> @@ -731,6 +731,80 @@ func.func @matrix_diag_part_align_7d(%arg0: tensor<3x5x7x9x11x13x17xf32>) -> ten func.return %2: tensor<3x5x7x9x11x4x10xf32> } +//===----------------------------------------------------------------------===// +// Conv +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @conv2d_NHWC +func.func @conv2d_NHWC(%arg0: tensor<1x4x4x2xf32> {tf_saved_model.index_path = ["input_2"]}, %arg1: tensor<3x3x2x2xf32>, %arg2: tensor<2xf32>, %arg3: tensor<2xf32>, %arg4: tensor<2xf32>, %arg5: tensor<2xf32>, %arg6: tensor<2xf32>, %arg7: tensor<2xf32>) -> (tensor<1x4x4x2xf32> {tf_saved_model.index_path = [""]}) { + // CHECK{LITERAL}: mhlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 1], pad = [[1, 1], 
[1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x4x4x2xf32>, tensor<3x3x2x2xf32>) -> tensor<1x4x4x2xf32> + %0 = "tf.Conv2D"(%arg0, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> {device = ""} : (tensor<1x4x4x2xf32>, tensor<3x3x2x2xf32>) -> tensor<1x4x4x2xf32> + %1 = "tf.Mul"(%0, %arg6) : (tensor<1x4x4x2xf32>, tensor<2xf32>) -> tensor<1x4x4x2xf32> + %2 = "tf.AddV2"(%1, %arg7) : (tensor<1x4x4x2xf32>, tensor<2xf32>) -> tensor<1x4x4x2xf32> + return %2 : tensor<1x4x4x2xf32> +} + +// ----- + +// CHECK-LABEL: func @conv2d_backprop_input +func.func @conv2d_backprop_input(%arg0: tensor<3x3x8x8xf32>, %arg1: tensor<1x128x192x8xf32>) -> tensor<1x256x384x8xf32> { + %cst = "tf.Const"() <{value = dense<[1, 256, 384, 8]> : tensor<4xi32>}> : () -> tensor<4xi32> + %0 = "tf.Conv2DBackpropInput"(%cst, %arg0, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> {device = ""} : (tensor<4xi32>, tensor<3x3x8x8xf32>, tensor<1x128x192x8xf32>) -> tensor<1x256x384x8xf32> + return %0 : tensor<1x256x384x8xf32> + } + +//===----------------------------------------------------------------------===// +// Cumulative +//===----------------------------------------------------------------------===// + +// ----- + +// CHECK-LABEL: func @cumsum +func.func @cumsum(%arg0: tensor<1x4x1xf32>) -> tensor<1x4x1xf32> { + // CHECK: mhlo.reduce_window + // CHECK-SAME{LITERAL}: padding = dense<[[0, 0], [3, 0], [0, 0]]> : tensor<3x2xi64>, window_dimensions = dense<[1, 4, 1]> : tensor<3xi64>, window_strides = dense<1> : tensor<3xi64> + // CHECK: mhlo.add + %cst = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor + %0 = "tf.Cumsum"(%arg0, %cst) <{exclusive = false, reverse = false}> {device = ""} : (tensor<1x4x1xf32>, tensor) -> tensor<1x4x1xf32> + return %0 : tensor<1x4x1xf32> +} + +// ----- + +// CHECK-LABEL: func @cumprod +func.func @cumprod(%arg0: tensor<1x4x1xf32>) -> tensor<1x4x1xf32> { + // CHECK: mhlo.reduce_window + // CHECK-SAME{LITERAL}: padding = dense<0> : tensor<3x2xi64>, window_dimensions = dense<1> : tensor<3xi64>, window_strides = dense<1> : tensor<3xi64> + // CHECK: mhlo.multiply + %cst = "tf.Const"() <{value = dense<2> : tensor}> : () -> tensor + %0 = "tf.Cumprod"(%arg0, %cst) <{exclusive = false, reverse = false}> {device = ""} : (tensor<1x4x1xf32>, tensor) -> tensor<1x4x1xf32> + return %0 : tensor<1x4x1xf32> +} + +//===----------------------------------------------------------------------===// +// DynamicSlice +//===----------------------------------------------------------------------===// + +// ----- + +// CHECK-LABEL: func @dynamic_slice_i32 +func.func @dynamic_slice_i32(%arg0: tensor<8x512x384xbf16>, %arg1: tensor<3xi32>) -> tensor<1x512x384xbf16> attributes {allow_soft_placement = false, tf.entry_function = {control_outputs = "", inputs = "_arg0,_arg1,_arg2", outputs = "_retval0"}} { + %cst = "tf.Const"() <{value = dense<[1, 512, 384]> : tensor<3xi32>}> : () -> tensor<3xi32> + // CHECK: "mhlo.dynamic_slice"{{.*}}slice_sizes = dense<[1, 512, 384]> : tensor<3xi64> + %0 = "tf.XlaDynamicSlice"(%arg0, %arg1, %cst) {device = ""} : (tensor<8x512x384xbf16>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x512x384xbf16> + return %0 : tensor<1x512x384xbf16> +} + +// ----- + +// CHECK-LABEL: func @dynamic_slice_i64 +func.func 
@dynamic_slice_i64(%arg0: tensor<8x512x384xbf16>, %arg1: tensor<3xi32>) -> tensor<1x512x384xbf16> attributes {allow_soft_placement = false, tf.entry_function = {control_outputs = "", inputs = "_arg0,_arg1,_arg2", outputs = "_retval0"}} { + %cst = "tf.Const"() <{value = dense<[1, 512, 384]> : tensor<3xi64>}> : () -> tensor<3xi64> + // CHECK: "mhlo.dynamic_slice"{{.*}}slice_sizes = dense<[1, 512, 384]> : tensor<3xi64> + %0 = "tf.XlaDynamicSlice"(%arg0, %arg1, %cst) {device = ""} : (tensor<8x512x384xbf16>, tensor<3xi32>, tensor<3xi64>) -> tensor<1x512x384xbf16> + return %0 : tensor<1x512x384xbf16> +} + //===----------------------------------------------------------------------===// // Erf //===----------------------------------------------------------------------===// @@ -739,7 +813,8 @@ func.func @matrix_diag_part_align_7d(%arg0: tensor<3x5x7x9x11x13x17xf32>) -> ten // CHECK-LABEL: func @erf func.func @erf(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { - // CHECK: mhlo.erf %arg0 : tensor<2x3xf32> + // CHECK: chlo.erf %arg0 : tensor<2x3xf32> + // CHLO: mhlo.erf %arg0 : tensor<2x3xf32> %0 = "tf.Erf"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32> func.return %0 : tensor<2x3xf32> } @@ -1488,7 +1563,7 @@ func.func @max_pool_grad_valid(%orig_input: tensor<10x24x24x64xf32>, %orig_outpu // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK: %[[RESULT:.*]] = "mhlo.select_and_scatter"(%[[INPUT]], %[[GRAD]], %[[ZERO]]) <{window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>, window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64>}> ({ // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): - // CHECK: %[[SELECT_RESULT:.*]] = mhlo.compare GE, %[[VALUE_A]], %[[VALUE_B]], NOTYPE : (tensor, tensor) -> tensor + // CHECK: %[[SELECT_RESULT:.*]] = mhlo.compare GE, %[[VALUE_A]], %[[VALUE_B]] : (tensor, tensor) -> tensor // CHECK: mhlo.return %[[SELECT_RESULT]] : tensor // CHECK: }, { // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): @@ -1513,7 +1588,7 @@ func.func @max_pool_3d_grad_valid(%orig_input: tensor<10x8x24x24x64xf32>, %orig_ // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK: %[[RESULT:.*]] = "mhlo.select_and_scatter"(%[[INPUT]], %[[GRAD]], %[[ZERO]]) <{window_dimensions = dense<[1, 1, 2, 2, 1]> : tensor<5xi64>, window_strides = dense<[1, 1, 2, 2, 1]> : tensor<5xi64>}> ({ // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): - // CHECK: %[[SELECT_RESULT:.*]] = mhlo.compare GE, %[[VALUE_A]], %[[VALUE_B]], NOTYPE : (tensor, tensor) -> tensor + // CHECK: %[[SELECT_RESULT:.*]] = mhlo.compare GE, %[[VALUE_A]], %[[VALUE_B]] : (tensor, tensor) -> tensor // CHECK: mhlo.return %[[SELECT_RESULT]] : tensor // CHECK: }, { // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): @@ -1558,7 +1633,7 @@ func.func @max_pool_3d_grad_same(%orig_input: tensor<2x8x13x25x7xf32>, %orig_out func.func @one_hot(%indices: tensor<3xi32>, %on_value: tensor, %off_value: tensor) -> tensor<3x5xf32> { // CHECK: %[[IOTA:.*]] = "mhlo.iota"() <{iota_dimension = 1 : i64}> : () -> tensor<3x5xi32> // CHECK: %[[BCAST_ARG0:.+]] = "mhlo.broadcast_in_dim"(%arg0) <{broadcast_dimensions = dense<0> : tensor<1xi64>}> : (tensor<3xi32>) -> tensor<3x5xi32> - // CHECK: %[[COMPARE:.*]] = mhlo.compare EQ, %[[BCAST_ARG0]], %[[IOTA]], NOTYPE : (tensor<3x5xi32>, tensor<3x5xi32>) -> tensor<3x5xi1> + // CHECK: %[[COMPARE:.*]] = mhlo.compare EQ, %[[BCAST_ARG0]], %[[IOTA]] : (tensor<3x5xi32>, tensor<3x5xi32>) -> tensor<3x5xi1> // CHECK: %[[ON_VALUE:.*]] = 
"mhlo.broadcast"(%arg1) <{broadcast_sizes = dense<[3, 5]> : tensor<2xi64>}> : (tensor) -> tensor<3x5xf32> // CHECK: %[[OFF_VALUE:.*]] = "mhlo.broadcast"(%arg2) <{broadcast_sizes = dense<[3, 5]> : tensor<2xi64>}> : (tensor) -> tensor<3x5xf32> // CHECK: %[[RESULT:.*]] = mhlo.select %[[COMPARE]], %[[ON_VALUE]], %[[OFF_VALUE]] : tensor<3x5xi1>, tensor<3x5xf32> @@ -1763,7 +1838,7 @@ func.func @stateful_pcall_multi_in_out(%arg0: tensor, %arg1: tensor) - // CHECK-LABEL: func @elu func.func @elu(%arg0: tensor<1xf32>) -> tensor<1xf32> { - // CHECK-DAG: %[[ZERO:.*]] = "chlo.constant_like"(%arg0) <{value = 0.000000e+00 : f32}> : (tensor<1xf32>) -> tensor<1xf32> + // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor<1xf32> // CHECK-DAG: %[[PRED:.*]] = mhlo.compare GT, %arg0, %[[ZERO]] // CHECK-DAG: %[[EXP:.*]] = mhlo.exponential_minus_one %arg0 // CHECK: %[[RESULT:.*]] = mhlo.select %[[PRED]], %arg0, %[[EXP]] @@ -1841,7 +1916,7 @@ func.func @leaky_relu(%arg0: tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> attribu // CHECK-NEXT: %[[ALPHA:.*]] = "chlo.constant_like"(%arg0) <{value = 2.000000e-01 : f32}> : (tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> // CHECK-NEXT: %[[ZERO:.*]] = "chlo.constant_like"(%arg0) <{value = 0.000000e+00 : f32}> : (tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> // CHECK-NEXT: %[[LEAKY:.*]] = mhlo.multiply %[[INP:.*]], %[[ALPHA]] : tensor<1x4x4x3xf32> - // CHECK-NEXT: %[[CMP:.*]] = mhlo.compare GT, %[[INP]], %[[ZERO]], NOTYPE : (tensor<1x4x4x3xf32>, tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xi1> + // CHECK-NEXT: %[[CMP:.*]] = mhlo.compare GT, %[[INP]], %[[ZERO]] : (tensor<1x4x4x3xf32>, tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xi1> // CHECK-NEXT: %[[RES:.*]] = mhlo.select %[[CMP]], %[[INP]], %[[LEAKY]] : tensor<1x4x4x3xi1>, tensor<1x4x4x3xf32> // CHECK-NEXT: return %[[RES]] : tensor<1x4x4x3xf32> %0 = "tf.LeakyRelu"(%arg0) {alpha = 2.000000e-01 : f32, device = ""} : (tensor<1x4x4x3xf32>) -> tensor<1x4x4x3xf32> @@ -1855,7 +1930,7 @@ func.func @leaky_relu_grad(%arg0: tensor<1x4x4xf32>, %arg1: tensor<1x4x4xf32>) - // CHECK-NEXT: %[[ALPHA:.*]] = "chlo.constant_like"(%arg1) <{value = 2.000000e-01 : f32}> : (tensor<1x4x4xf32>) -> tensor<1x4x4xf32> // CHECK-NEXT: %[[ZERO:.*]] = "chlo.constant_like"(%arg1) <{value = 0.000000e+00 : f32}> : (tensor<1x4x4xf32>) -> tensor<1x4x4xf32> // CHECK-NEXT: %[[LEAKYGRAD:.*]] = mhlo.multiply %[[GRADIENT:.*]], %[[ALPHA]] : tensor<1x4x4xf32> - // CHECK-NEXT: %[[CMP:.*]] = mhlo.compare GT, %[[INP:.*]], %[[ZERO]], NOTYPE : (tensor<1x4x4xf32>, tensor<1x4x4xf32>) -> tensor<1x4x4xi1> + // CHECK-NEXT: %[[CMP:.*]] = mhlo.compare GT, %[[INP:.*]], %[[ZERO]] : (tensor<1x4x4xf32>, tensor<1x4x4xf32>) -> tensor<1x4x4xi1> // CHECK-NEXT: %[[RES:.*]] = mhlo.select %[[CMP]], %[[GRADIENT]], %[[LEAKYGRAD]] : tensor<1x4x4xi1>, tensor<1x4x4xf32> // CHECK-NEXT: return %[[RES]] : tensor<1x4x4xf32> %0 = "tf.LeakyReluGrad"(%arg0, %arg1) {alpha = 2.000000e-01 : f32, device = ""} : (tensor<1x4x4xf32>, tensor<1x4x4xf32>) -> tensor<1x4x4xf32> @@ -1866,7 +1941,7 @@ func.func @leaky_relu_grad(%arg0: tensor<1x4x4xf32>, %arg1: tensor<1x4x4xf32>) - // CHECK-LABEL: func @softsign func.func @softsign(%arg0: tensor<4x10xf32>) -> tensor<4x10xf32> { - // CHECK-NEXT: %[[ONE:.*]] = "chlo.constant_like"(%arg0) <{value = 1.000000e+00 : f32}> : (tensor<4x10xf32>) -> tensor<4x10xf32> + // CHECK-NEXT: %[[ONE:.*]] = mhlo.constant dense<1.000000e+00> : tensor<4x10xf32> // CHECK-NEXT: %[[ABS:.*]] = mhlo.abs %{{.*}} : tensor<4x10xf32> // CHECK-NEXT: %[[ADD:.*]] = mhlo.add %[[ONE]], %[[ABS]] : 
tensor<4x10xf32> // CHECK-NEXT: %[[DIV:.*]] = mhlo.divide %{{.*}}, %[[ADD]] : tensor<4x10xf32> diff --git a/tensorflow/compiler/mlir/tf2xla/tests/registration/BUILD b/tensorflow/compiler/mlir/tf2xla/tests/registration/BUILD index a5d8d8d8c518..f46627f0e435 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/registration/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/tests/registration/BUILD @@ -12,11 +12,11 @@ cc_library( "graph_to_tf_executor_registration.cc", ], deps = [ - "//tensorflow/compiler/mlir/lite/tools:translate_cl_options", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow/translate:mlir_roundtrip_flags", "//tensorflow/compiler/mlir/tensorflow/translate/tools:file_tf_mlir_translate", "//tensorflow/compiler/mlir/tf2xla/api/v2:tf_executor_to_graph", + "//tensorflow/compiler/mlir/tools:translate_cl_options", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/core:core_cpu_base", diff --git a/tensorflow/compiler/mlir/tf2xla/tests/registration/graph_to_tf_executor_registration.cc b/tensorflow/compiler/mlir/tf2xla/tests/registration/graph_to_tf_executor_registration.cc index 8a9811c8dcbc..7b7b5771f5a4 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/registration/graph_to_tf_executor_registration.cc +++ b/tensorflow/compiler/mlir/tf2xla/tests/registration/graph_to_tf_executor_registration.cc @@ -26,11 +26,11 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Tools/mlir-translate/Translation.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.h" #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tools/file_tf_mlir_translate.h" #include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_executor_to_graph.h" +#include "tensorflow/compiler/mlir/tools/tf_mlir_translate_cl.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "xla/client/client_library.h" diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD index abd057643629..1e85dbff84e2 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD @@ -16,12 +16,7 @@ package( gentbl_cc_library( name = "legalize_tf_patterns_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "generated_legalize_tf.inc", - ), - ], + tbl_outs = {"generated_legalize_tf.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "legalize_tf_patterns.td", deps = [ @@ -30,21 +25,18 @@ gentbl_cc_library( "@llvm-project//mlir:FuncTdFiles", "@llvm-project//mlir:TensorOpsTdFiles", "@local_xla//xla/mlir_hlo:hlo_ops_td_files", + "@stablehlo//:chlo_ops_td_files", + "@stablehlo//:stablehlo_ops_td_files", ], ) gentbl_cc_library( name = "xla_legalize_tf_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=LegalizeTf", - ], - "xla_legalize_tf_passes.h.inc", - ), - ], + tbl_outs = {"xla_legalize_tf_passes.h.inc": [ + "-gen-pass-decls", + "-name=LegalizeTf", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "xla_legalize_tf_passes.td", deps = [ @@ -55,15 +47,10 @@ 
gentbl_cc_library( gentbl_cc_library( name = "tf_xla_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=TfXla", - ], - "tf_xla_passes.h.inc", - ), - ], + tbl_outs = {"tf_xla_passes.h.inc": [ + "-gen-pass-decls", + "-name=TfXla", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tf_xla_passes.td", deps = [ @@ -177,6 +164,8 @@ cc_library( "@local_xla//xla/mlir_hlo:convert_op_folder", "@local_xla//xla/tsl/platform:status", "@stablehlo//:chlo_ops", + "@stablehlo//:stablehlo_ops", + "@stablehlo//:stablehlo_pass_utils", ] + if_static(["@local_tsl//tsl/platform:tensor_float_32_utils"]), ) @@ -262,7 +251,6 @@ cc_library( ":xla_legalize_targets", ":xla_legalize_tf_passes_inc_gen", ":xla_legalize_tf_with_tf2xla", - "//tensorflow/compiler/mlir/quantization/stablehlo:bridge_passes", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:mangling_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", @@ -305,8 +293,10 @@ cc_library( "@local_xla//xla/mlir_hlo:type_conversion", "@local_xla//xla/stream_executor/tpu:c_api_conversions", "@local_xla//xla/stream_executor/tpu:tpu_api", + "@stablehlo//:base", "@stablehlo//:chlo_ops", "@stablehlo//:stablehlo_ops", + "@stablehlo//:stablehlo_passes", ], ) @@ -339,7 +329,6 @@ cc_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/framework:allocator", "//tensorflow/core/protobuf:for_core_protos_cc", - "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/memory", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", @@ -361,12 +350,13 @@ cc_library( "@local_xla//xla/hlo/translate/hlo_to_mhlo:hlo_function_importer", "@local_xla//xla/hlo/translate/hlo_to_mhlo:hlo_to_mlir_hlo", "@local_xla//xla/hlo/translate/mhlo_to_hlo:type_to_shape", - "@local_xla//xla/mlir_hlo", "@local_xla//xla/service:hlo_proto_cc", "@local_xla//xla/tsl/platform:env", "@local_xla//xla/tsl/platform:errors", "@local_xla//xla/tsl/platform:status", "@local_xla//xla/tsl/platform:statusor", + "@stablehlo//:base", + "@stablehlo//:stablehlo_ops", ], ) @@ -381,9 +371,7 @@ tf_cc_test( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/core:framework", "//tensorflow/core:ops", - "@com_google_absl//absl/memory", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_googletest//:gtest_main", @@ -396,11 +384,11 @@ tf_cc_test( "@local_xla//xla:xla_data_proto_cc", "@local_xla//xla/hlo/builder:xla_builder", "@local_xla//xla/hlo/builder:xla_computation", - "@local_xla//xla/mlir_hlo", "@local_xla//xla/tsl/lib/core:status_test_util", "@local_xla//xla/tsl/platform:errors", "@local_xla//xla/tsl/platform:status", "@local_xla//xla/tsl/platform:statusor", + "@stablehlo//:stablehlo_ops", ], ) @@ -442,7 +430,7 @@ cc_library( "@llvm-project//mlir:Support", "@llvm-project//mlir:TensorDialect", "@llvm-project//mlir:TransformUtils", - "@local_xla//xla/mlir_hlo", + "@stablehlo//:base", ], ) diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc index 7df70e4de558..305f6a2c2fbb 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc @@ -20,7 +20,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.h" namespace mlir { -namespace mhlo { +namespace hlo { namespace { @@ -358,6 +358,7 @@ bool IsOpTypeAllowedTf2XlaFallback(const TypeID& type_id) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get< TF::XlaSparseDenseMatmulGradWithAdagradAndStaticBufferSizeOp>(), @@ -370,6 +371,18 @@ bool IsOpTypeAllowedTf2XlaFallback(const TypeID& type_id) { TypeID::get< TF::XlaSparseDenseMatmulGradWithSgdAndStaticBufferSizeOp>(), // NOLINT TypeID::get(), + TypeID::get< + TF::XlaSparseDenseMatmulCustomCombinerOnTcGradWithSgdAndCsrInputOp>(), // NOLINT + TypeID::get< + TF::XlaSparseDenseMatmulCustomCombinerOnTcGradWithAdagradAndCsrInputOp>(), // NOLINT + TypeID::get< + TF::XlaSparseDenseMatmulCustomCombinerOnTcGradWithAdagradMomentumAndCsrInputOp>(), // NOLINT + TypeID::get< + TF::XlaSparseDenseMatmulCustomCombinerOnTcGradWithAdamAndCsrInputOp>(), // NOLINT + TypeID::get< + TF::XlaSparseDenseMatmulCustomCombinerOnTcGradWithFtrlAndCsrInputOp>(), // NOLINT + TypeID::get< + TF::XlaSparseDenseMatmulCustomCombinerOnTcGradWithCsrInputOp>(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -544,5 +557,5 @@ bool IsDynamicPadderOp(const TypeID& type_id) { return DynamicTensorflowOps().contains(type_id); } -} // namespace mhlo +} // namespace hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.h b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.h index b94f3370dabc..329ab3426015 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.h +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.h @@ -19,7 +19,7 @@ limitations under the License. #include "mlir/Support/TypeID.h" // from @llvm-project namespace mlir { -namespace mhlo { +namespace hlo { // Given the type ID, check if it's legalized with MLIR. bool IsTypeLegalizedWithMlir(const TypeID& type_id); @@ -39,7 +39,7 @@ bool IsOpAllowedTf2xlaFallback(const TypeID& type_id); // used over the MLIR lowering. bool IsOpAllowedTf2xlaPreferred(const TypeID& type_id); -} // namespace mhlo +} // namespace hlo } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_LEGALIZATION_OP_CONFIG_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc index 113b088b3db7..0ca9062366ed 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc @@ -33,7 +33,7 @@ limitations under the License. #include "tensorflow/core/framework/kernel_def.pb.h" namespace mlir { -namespace mhlo { +namespace hlo { TEST(LegalizationOpConfigTest, ExpectsTrueForMlirTypeID) { EXPECT_TRUE(IsTypeLegalizedWithMlir(TypeID::get())); @@ -83,7 +83,7 @@ TEST(LegalizationOpConfigTest, CountLoweringsSet) { // from MLIR to TF2XLA), these numbers should change. Or if TF Dialect adds // a new op, we should expect these to change too. 
EXPECT_EQ(mlir_lowering_count, 67); - EXPECT_EQ(tf2xla_fallback_count, 323); + EXPECT_EQ(tf2xla_fallback_count, 330); EXPECT_EQ(non_categorized_count, 431); } @@ -121,7 +121,7 @@ TEST(LegalizationOpConfigTest, CountAllMlirLoweringPatterns) { context.loadAllAvailableDialects(); RewritePatternSet mlir_legalize_lower_patterns(&context); - PopulateLegalizeTfPatterns(&context, &mlir_legalize_lower_patterns); + hlo::PopulateLegalizeTfPatterns(&context, &mlir_legalize_lower_patterns); int mlir_only_patterns = 0; for (auto& pattern : mlir_legalize_lower_patterns.getNativePatterns()) { @@ -161,7 +161,7 @@ TEST(LegalizationOpConfigTest, MlirLoweringWithoutXlaKernel) { context.loadAllAvailableDialects(); RewritePatternSet mlir_legalize_lower_patterns(&context); - PopulateLegalizeTfPatterns(&context, &mlir_legalize_lower_patterns); + hlo::PopulateLegalizeTfPatterns(&context, &mlir_legalize_lower_patterns); int mlir_without_xla_count = 0; for (auto& pattern : mlir_legalize_lower_patterns.getNativePatterns()) { @@ -179,5 +179,5 @@ TEST(LegalizationOpConfigTest, MlirLoweringWithoutXlaKernel) { EXPECT_EQ(mlir_without_xla_count, 13); } -} // namespace mhlo +} // namespace hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc index 047a5fb7b46b..9b70d1cc1e66 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc @@ -42,6 +42,8 @@ limitations under the License. #include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project #include "mlir/Dialect/Traits.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project @@ -57,6 +59,8 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "stablehlo/dialect/ChloOps.h" // from @stablehlo +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "stablehlo/transforms/PassUtils.h" // from @stablehlo // IWYU pragma: keep, legalize_tf_patterns.td #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h" @@ -66,7 +70,6 @@ limitations under the License. #include "xla/hlo/builder/padding.h" #include "xla/hlo/builder/sharding_builder.h" #include "xla/hlo/translate/hlo_to_mhlo/attribute_importer.h" -#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" #include "xla/mlir_hlo/utils/convert_op_folder.h" #include "xla/mlir_hlo/utils/hlo_utils.h" #include "xla/tsl/platform/status.h" @@ -80,7 +83,14 @@ limitations under the License. 
#include "tsl/platform/tensor_float_32_utils.h" namespace mlir { -namespace mhlo { +namespace hlo { + +// Methods from utils.h +using mhlo::BuildReduceBody; +using mhlo::GetI64ElementsAttr; +using mhlo::GetScalarConstOfType; +using mhlo::GetScalarNegZeroOfType; + namespace { constexpr char kShardingAttr[] = "mhlo.sharding"; @@ -99,6 +109,34 @@ void GetI64ArrayAttrValues(Attribute attr, SmallVectorImpl *values) { values->push_back(mlir::cast(val).getValue().getSExtValue()); } +DenseI64ArrayAttr GetI64ArrayAttr(ArrayRef values, Builder *builder) { + return builder->getDenseI64ArrayAttr(values); +} + +static DenseI64ArrayAttr ToDenseI64ArrayAttr(DenseIntElementsAttr attr, + Builder *builder) { + if (!attr) return {}; + if (attr.getElementType().isInteger(64)) { + return GetI64ArrayAttr(llvm::to_vector(attr.getValues()), builder); + } + + // Requires conversion to i64 first. + std::vector values; + values.reserve(attr.getNumElements()); + for (auto value : attr.getValues()) { + values.push_back(value.getValue().getSExtValue()); + } + return GetI64ArrayAttr(values, builder); +} + +static DenseI64ArrayAttr ToDenseI64ArrayAttr(ElementsAttr attr, + Builder *builder) { + return ToDenseI64ArrayAttr( + mlir::cast( + hlo::convertElementsAttr(attr, builder->getIntegerType(64))), + builder); +} + // Returns 1D 32-bit dense elements attribute with the given values. static DenseIntElementsAttr GetI32ElementsAttr(ArrayRef values, Builder *builder) { @@ -109,26 +147,22 @@ static DenseIntElementsAttr GetI32ElementsAttr(ArrayRef values, // Returns a 1-d i64 elements attribute populated with numbers from start to // end, excluding. -static DenseIntElementsAttr GetI64ElementsAttrForSeq(int start, int end, - Builder *builder) { +static DenseI64ArrayAttr GetI64ArrayAttrForSeq(int start, int end, + Builder *builder) { int size = end - start; SmallVector vals; vals.resize(size); std::iota(vals.begin(), vals.end(), start); - - TensorType ty = - tensorflow::GetTypeFromTFTensorShape({size}, builder->getIntegerType(64)); - return DenseIntElementsAttr::get(ty, vals); + return builder->getDenseI64ArrayAttr(vals); } // Returns a 1-d i64 elements attribute populated with `val` repeated `size` // times. -static DenseIntElementsAttr GetI64ElementsAttrForValue(int size, int64_t val, - Builder *builder) { - TensorType ty = - tensorflow::GetTypeFromTFTensorShape({size}, builder->getIntegerType(64)); - return DenseIntElementsAttr::get(ty, val); +static DenseI64ArrayAttr GetI64ArrayAttrForValue(int size, int64_t val, + Builder *builder) { + llvm::SmallVector vals(size, val); + return builder->getDenseI64ArrayAttr(vals); } // Returns the corresponding type that should be used for performing sum @@ -164,14 +198,14 @@ static IntegerAttr GetHLOAxisFromTFAxis(Attribute attr, int64_t rank, // Returns a PrecisionConfig as an array attribute based on whether TF32 // execution is enabled static ArrayAttr GetPrecisionConfig(Builder *builder) { - mlir::mhlo::Precision precision = tsl::tensor_float_32_execution_enabled() - ? mhlo::Precision::DEFAULT - : mlir::mhlo::Precision::HIGHEST; + mlir::stablehlo::Precision precision = + tsl::tensor_float_32_execution_enabled() ? 
stablehlo::Precision::DEFAULT + : stablehlo::Precision::HIGHEST; llvm::SmallVector attr_vec; const int num_inputs = 2; for (int i = 0; i < num_inputs; i++) { attr_vec.push_back( - mlir::mhlo::PrecisionAttr::get(builder->getContext(), precision)); + mlir::stablehlo::PrecisionAttr::get(builder->getContext(), precision)); } return builder->getArrayAttr(attr_vec); } @@ -193,9 +227,10 @@ static std::optional GetIntegerHLOAxisFromTFAxis(Value value, /// Returns a `ConvertOp` that casts the elements to a i64 type while retaining /// the shape of the input value. -static ConvertOp CastValueToI64(Location loc, Value value, - PatternRewriter *rewriter) { - return rewriter->create(loc, value, rewriter->getIntegerType(64)); +static stablehlo::ConvertOp CastValueToI64(Location loc, Value value, + PatternRewriter *rewriter) { + return rewriter->create(loc, value, + rewriter->getIntegerType(64)); } // Creates an unpack op along the 0th dimension of the tensor. The `value` input @@ -239,10 +274,11 @@ tensorflow::TensorShape ToTensorShape( // Returns a limit scalar const op for the given type. // Requires FloatType or IntegerType -static ConstantOp GetScalarLimitConstOfType(Type ty, Location loc, - hlo::ScalarLimit limit, - OpBuilder *builder) { - return builder->create(loc, hlo::getScalarLimitOfType(ty, limit)); +static stablehlo::ConstantOp GetScalarLimitConstOfType(Type ty, Location loc, + hlo::ScalarLimit limit, + OpBuilder *builder) { + return builder->create( + loc, hlo::getScalarLimitOfType(ty, limit)); } // Deprecated: This is maintained to aid in porting old code that is not yet @@ -311,22 +347,24 @@ static Value StaticBinaryBroadcast(Location loc, Value x, Value y, return nullptr; } auto larger_broadcast_dims = - GetI64ElementsAttrForSeq(0, result_type.getRank(), &builder); + GetI64ArrayAttrForSeq(0, result_type.getRank(), &builder); if (x_type.getRank() < y_type.getRank()) { if (x_type != result_type) { - x = builder.create(loc, result_type, x, broadcast_dims); + x = builder.create(loc, result_type, x, + broadcast_dims); } if (y_type != result_type) { - y = builder.create(loc, result_type, y, - larger_broadcast_dims); + y = builder.create(loc, result_type, y, + larger_broadcast_dims); } } else { if (x_type != result_type) { - x = builder.create(loc, result_type, x, - larger_broadcast_dims); + x = builder.create(loc, result_type, x, + larger_broadcast_dims); } if (y_type != result_type) { - y = builder.create(loc, result_type, y, broadcast_dims); + y = builder.create(loc, result_type, y, + broadcast_dims); } } return builder.create(loc, x, y); @@ -356,13 +394,13 @@ static RankedTensorType GetExtentsTensorTypeFor(TensorType value_type) { static Value Broadcast1DToFeatureDim(Location loc, Value broadcast_to, Value broadcast_from, int64_t feature_dim, OpBuilder &builder) { - auto broadcast_dims = GetI64ElementsAttr({feature_dim}, &builder); + auto broadcast_dims = GetI64ArrayAttr({feature_dim}, &builder); auto to_type = mlir::cast(broadcast_to.getType()); auto result_shape = builder.create(loc, broadcast_to); auto result_extents_type = GetExtentsTensorTypeFor(to_type); auto result_extents = builder.create( loc, result_extents_type, result_shape); - return builder.create( + return builder.create( loc, to_type, broadcast_from, result_extents, broadcast_dims); } @@ -381,8 +419,8 @@ static Value BroadcastToShapeOf(Location loc, Value input, Value broadcast_to, auto result_extents = builder.create( loc, result_extents_type, result_shape); int64_t rank = mlir::cast(input.getType()).getRank(); - auto 
broadcast_dims = GetI64ElementsAttrForSeq(0, rank, &builder); - return builder.create( + auto broadcast_dims = GetI64ArrayAttrForSeq(0, rank, &builder); + return builder.create( loc, to_type, input, result_extents, broadcast_dims); } @@ -391,33 +429,35 @@ static Value BroadcastToShapeOf(Location loc, Value input, Value broadcast_to, static Value ApplyReduction(Location loc, Value input, DenseIntElementsAttr reduce_dims, OpBuilder *builder) { - auto reduce_dims_op = builder->create(loc, reduce_dims); + auto reduce_dims_op = + builder->create(loc, reduce_dims); return builder->create(loc, input, reduce_dims_op, builder->getBoolAttr(false)); } -// Creates a mhlo.rng_uniform op with `builder` to generate `num_elements` +// Creates a stablehlo.rng_uniform op with `builder` to generate `num_elements` // 32-bit integer numbers in the range of [`lower_limit`, `upper_limit`). -static mhlo::RngOp CreateRngUniform32(Location loc, int num_elements, - int lower_limit, int upper_limit, - OpBuilder *builder) { - auto shape_tensor = builder->create( +static stablehlo::RngOp CreateRngUniform32(Location loc, int num_elements, + int lower_limit, int upper_limit, + OpBuilder *builder) { + auto shape_tensor = builder->create( loc, GetI64ElementsAttr({num_elements}, builder)); - auto lower = builder->create( + auto lower = builder->create( loc, builder->getI32IntegerAttr(lower_limit)); - auto upper = builder->create( + auto upper = builder->create( loc, builder->getI32IntegerAttr(upper_limit)); - return builder->create(loc, lower, upper, shape_tensor, - ::mlir::mhlo::RngDistribution::UNIFORM); + return builder->create( + loc, lower, upper, shape_tensor, + ::mlir::stablehlo::RngDistribution::UNIFORM); } using WhileBodyFnType = llvm::function_ref old_values, SmallVectorImpl *new_values, OpBuilder *builder)>; -// Creates a mhlo.while op with `builder` to loop `num_interations` times, +// Creates a stablehlo.while op with `builder` to loop `num_interations` times, // each time calling the given `body_fn` on a set of values to generate a new // set of values. Returns the final set of values via `final_values`. The // initial set of values is passed in via `init_values`. @@ -449,8 +489,8 @@ static void CreateWhile32(Location loc, int num_iterations, init_types_with_loop_iv.reserve(value_count); // The initial value for the loop induction variable is 0. - init_values_with_loop_iv.push_back( - builder->create(loc, builder->getI32IntegerAttr(0))); + init_values_with_loop_iv.push_back(builder->create( + loc, builder->getI32IntegerAttr(0))); init_values_with_loop_iv.append(init_values.begin(), init_values.end()); // Accumulate types of all the init values. @@ -458,8 +498,8 @@ static void CreateWhile32(Location loc, int num_iterations, init_types_with_loop_iv.push_back(init_value_with_loop_iv.getType()); // Create the while op. - auto while_op = builder->create(loc, init_types_with_loop_iv, - init_values_with_loop_iv); + auto while_op = builder->create( + loc, init_types_with_loop_iv, init_values_with_loop_iv); auto ivs_count = init_types_with_loop_iv.size(); { @@ -473,12 +513,12 @@ static void CreateWhile32(Location loc, int num_iterations, // Get the loop induction variable and compare it against the upper limit. 
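The CreateWhile32 rewrite in this hunk keeps the same loop shape as before, only emitted with stablehlo ops: an i32 induction variable is prepended to the carried values, the condition region compares it against num_iterations, and the body region invokes body_fn and then increments the counter. The plain C++ sketch below models that control flow only; it is not MLIR builder code, and all names are illustrative.

// Semantic model of the counted loop CreateWhile32 builds: carried values
// plus a leading i32 induction variable, a "condition region" that checks
// iv < num_iterations, and a "body region" that runs body_fn and bumps iv.
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

using WhileBodyFn =
    std::function<void(const std::vector<int32_t>& old_values,
                       std::vector<int32_t>* new_values)>;

static std::vector<int32_t> RunCountedWhile(int32_t num_iterations,
                                            std::vector<int32_t> init_values,
                                            const WhileBodyFn& body_fn) {
  int32_t iv = 0;  // induction variable starts at 0
  while (iv < num_iterations) {   // the "condition region"
    std::vector<int32_t> next;    // the "body region"
    body_fn(init_values, &next);
    init_values = std::move(next);
    ++iv;                         // counter is carried and incremented
  }
  return init_values;
}

int main() {
  // Carry a single running sum and add 2 on every iteration.
  auto result = RunCountedWhile(
      5, {0}, [](const std::vector<int32_t>& in, std::vector<int32_t>* out) {
        out->push_back(in[0] + 2);
      });
  std::cout << result[0] << "\n";  // prints 10
}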
auto loop_iv = block->getArgument(0); - auto upper_limit = builder->create( + auto upper_limit = builder->create( loc, builder->getI32IntegerAttr(num_iterations)); - Value compare = builder->create(loc, loop_iv, upper_limit, - ComparisonDirection::LT); + Value compare = builder->create( + loc, loop_iv, upper_limit, stablehlo::ComparisonDirection::LT); - builder->create(loc, compare); + builder->create(loc, compare); } { @@ -500,15 +540,15 @@ static void CreateWhile32(Location loc, int num_iterations, &new_values, builder); // Increment the loop induction variable by one. - auto one = - builder->create(loc, builder->getI32IntegerAttr(1)); + auto one = builder->create( + loc, builder->getI32IntegerAttr(1)); auto scalar_broadcast_dims = builder->getDenseI64ArrayAttr({}); auto plus_one = builder->create( loc, block->getArgument(0), one, scalar_broadcast_dims); // Prepend with the updated loop induction variable. new_values.insert(new_values.begin(), plus_one); - builder->create(loc, new_values); + builder->create(loc, new_values); } // TODO(jpienaar): Support multi-operand while op. @@ -534,12 +574,12 @@ static IntegerAttr getFeatureDimensionAttr(Builder &b, // Returns the 1D i64 elements attribute populated with the inner-most dim of // the value. -static DenseIntElementsAttr GetInnerDimFromValue(ShapedType type, - Builder *builder) { +static DenseI64ArrayAttr GetInnerDimFromValue(ShapedType type, + Builder *builder) { if (type.getRank() == 0) { - return builder->getI64TensorAttr({}); + return builder->getDenseI64ArrayAttr({}); } - return builder->getI64TensorAttr(type.getShape().back()); + return builder->getDenseI64ArrayAttr(type.getShape().back()); } // Returns True if the inner-most dim is static. @@ -569,13 +609,13 @@ static DenseIntElementsAttr Get2DTransposePerm(BoolAttr transpose, Builder *b) { // // Always returns 64 bit integer attribute regardless of bitwidth of the input // attribute. -static DenseIntElementsAttr SliceDenseIntElementsAttrColumn2D( - ElementsAttr input, int column) { +static DenseI64ArrayAttr SliceDenseIntElementsAttrColumn2D(ElementsAttr input, + int column) { auto int_attr = mlir::cast(input); auto shaped_type = int_attr.getType(); auto shape = shaped_type.getShape(); - if (shape.size() != 2) return DenseIntElementsAttr(); + if (shape.size() != 2) return DenseI64ArrayAttr(); llvm::SmallVector values; values.reserve(shaped_type.getNumElements() / shape[1]); @@ -586,18 +626,15 @@ static DenseIntElementsAttr SliceDenseIntElementsAttrColumn2D( } } - auto element_type = IntegerType::get(input.getContext(), 64); - return DenseIntElementsAttr::get( - tensorflow::GetTypeFromTFTensorShape({shape[0]}, element_type), values); + return DenseI64ArrayAttr::get(input.getContext(), values); } // Returns interior padding to use in HLO Pad op based on the TensorFlow padding // in TensorFlow PadV2 op. -static DenseIntElementsAttr GetInteriorPadding(ElementsAttr tf_padding) { +static DenseI64ArrayAttr GetInteriorPadding(ElementsAttr tf_padding) { auto length = tf_padding.getShapedType().getShape()[0]; - auto element_type = IntegerType::get(tf_padding.getContext(), 64); - return DenseIntElementsAttr::get( - tensorflow::GetTypeFromTFTensorShape({length}, element_type), 0); + std::vector padding(length, 0); + return DenseI64ArrayAttr::get(tf_padding.getContext(), padding); } //===----------------------------------------------------------------------===// @@ -689,10 +726,10 @@ static DenseElementsAttr GetEpsilonValue(Type ty) { // ArgMax/ArgMin op utilities. 
//===----------------------------------------------------------------------===// -static void BuildArgMinMaxReductionBody(Type input_element_type, - Type index_element_type, - ComparisonDirection direction, - Region *body, OpBuilder *builder) { +static void BuildArgMinMaxReductionBody( + Type input_element_type, Type index_element_type, + stablehlo::ComparisonDirection direction, Region *body, + OpBuilder *builder) { OpBuilder::InsertionGuard insertion_point_gurad(*builder); Type input_type = @@ -710,20 +747,21 @@ static void BuildArgMinMaxReductionBody(Type input_element_type, Value rhs_index = block->getArgument(3); ImplicitLocOpBuilder b(loc, *builder); - Value compare_dt = b.create(lhs_val, rhs_val, direction); + Value compare_dt = + b.create(lhs_val, rhs_val, direction); Value selected_input = - b.create(input_type, compare_dt, lhs_val, rhs_val); + b.create(input_type, compare_dt, lhs_val, rhs_val); - Value compare_eq = - b.create(lhs_val, rhs_val, ComparisonDirection::EQ); - Value min_index = b.create(lhs_index, rhs_index); - Value min_val_index = - b.create(index_type, compare_dt, lhs_index, rhs_index); - Value selected_index = - b.create(index_type, compare_eq, min_index, min_val_index); + Value compare_eq = b.create( + lhs_val, rhs_val, stablehlo::ComparisonDirection::EQ); + Value min_index = b.create(lhs_index, rhs_index); + Value min_val_index = b.create(index_type, compare_dt, + lhs_index, rhs_index); + Value selected_index = b.create( + index_type, compare_eq, min_index, min_val_index); Value return_values[] = {selected_input, selected_index}; - b.create(return_values); + b.create(return_values); } //===----------------------------------------------------------------------===// @@ -780,13 +818,12 @@ static bool CanBeTranslatedToDynamicSlice(Value input, Value start_indices, // TF slice size can be -1, which represents all elements from start_index to // the end. HLO slice size can't be -1. As such, we need to translate TF slice // size -1 to HLO slice size. 
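As the comment above notes, a TF slice size of -1 stands for "all remaining elements from the start index", while HLO slice sizes must be explicit. A minimal sketch of that normalization follows, assuming static input dimensions and constant start indices (the static branch of the conversion); the helper name is illustrative.

// Translate TF slice sizes (where -1 means "to the end of the dimension")
// into explicit sizes, given static dims and constant start indices.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

static std::vector<int64_t> NormalizeSliceSizes(
    const std::vector<int64_t>& input_dims,
    const std::vector<int64_t>& start_indices,
    const std::vector<int64_t>& tf_slice_sizes) {
  assert(input_dims.size() == tf_slice_sizes.size());
  std::vector<int64_t> normalized;
  normalized.reserve(tf_slice_sizes.size());
  for (size_t i = 0; i < tf_slice_sizes.size(); ++i) {
    normalized.push_back(tf_slice_sizes[i] == -1
                             ? input_dims[i] - start_indices[i]
                             : tf_slice_sizes[i]);
  }
  return normalized;
}

int main() {
  // An [8, 512, 384] input sliced from [2, 0, 0] with TF sizes [-1, 512, 384]
  // becomes explicit HLO sizes [6, 512, 384].
  for (int64_t s :
       NormalizeSliceSizes({8, 512, 384}, {2, 0, 0}, {-1, 512, 384}))
    std::cout << s << " ";
  std::cout << "\n";
}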
-static DenseIntElementsAttr TFSliceSizes2HLOSliceSizes( +static DenseI64ArrayAttr TFSliceSizes2HLOSliceSizes( Value input, Value start_indices, DenseIntElementsAttr slice_sizes, Builder *builder) { DenseIntElementsAttr constant_start_indices; if (!matchPattern(start_indices, m_Constant(&constant_start_indices))) { - return mlir::cast( - hlo::convertElementsAttr(slice_sizes, builder->getIntegerType(64))); + return ToDenseI64ArrayAttr(slice_sizes, builder); } auto input_ty = mlir::dyn_cast(input.getType()); @@ -803,7 +840,7 @@ static DenseIntElementsAttr TFSliceSizes2HLOSliceSizes( : slice_size); } - return GetI64ElementsAttr(normalized_sizes, builder); + return GetI64ArrayAttr(normalized_sizes, builder); } //===----------------------------------------------------------------------===// @@ -815,11 +852,11 @@ bool HasValidGatherDims(StringAttr attr) { return dims.ParseFromString(attr.getValue().str()); } -GatherDimensionNumbersAttr GetGatherDimNumsAttr(StringAttr attr, - Builder *builder) { +stablehlo::GatherDimensionNumbersAttr GetGatherDimNumsAttr(StringAttr attr, + Builder *builder) { ::xla::GatherDimensionNumbers dims; if (!dims.ParseFromString(attr.getValue().str())) return {}; - return ::xla::ConvertGatherDimensionNumbers(dims, builder); + return ::xla::stablehlo::ConvertGatherDimensionNumbers(dims, builder); } //===----------------------------------------------------------------------===// @@ -831,10 +868,11 @@ bool HasValidDotDims(StringAttr attr) { return dims.ParseFromString(attr.getValue().str()); } -DotDimensionNumbersAttr GetDotDimNumsAttr(StringAttr attr, Builder *builder) { +stablehlo::DotDimensionNumbersAttr GetDotDimNumsAttr(StringAttr attr, + Builder *builder) { ::xla::DotDimensionNumbers dims; if (!dims.ParseFromString(attr.getValue().str())) return {}; - return ::xla::ConvertDotDimensionNumbers(dims, builder); + return ::xla::stablehlo::ConvertDotDimensionNumbers(dims, builder); } bool HasValidPrecisionConfig(StringAttr attr) { @@ -845,7 +883,7 @@ bool HasValidPrecisionConfig(StringAttr attr) { mlir::ArrayAttr GetPrecisionConfigAttr(StringAttr attr, Builder *builder) { ::xla::PrecisionConfig precision; if (!precision.ParseFromString(attr.getValue().str())) return {}; - return ::xla::ConvertPrecisionConfig(&precision, builder); + return ::xla::stablehlo::ConvertPrecisionConfig(&precision, builder); } //===----------------------------------------------------------------------===// @@ -862,7 +900,7 @@ static void BuildBodyWithCall(PatternRewriter &rewriter, const Location &loc, block->addArguments(inputs, SmallVector(inputs.size(), loc)); mlir::func::CallOp call_op = rewriter.create( loc, func, func_ty.getResults(), block->getArguments()); - rewriter.create(loc, call_op.getResults()); + rewriter.create(loc, call_op.getResults()); } //===----------------------------------------------------------------------===// @@ -889,7 +927,7 @@ NamedAttribute GetConvDimensionNumbersAttr(ArrayRef spatial_dims, return builder->getNamedAttr( "dimension_numbers", - ConvDimensionNumbersAttr::get( + stablehlo::ConvDimensionNumbersAttr::get( builder->getContext(), batch_dim, feature_dim, spatial_dims, kernel_input_feature_dim, kernel_output_feature_dim, kernel_spatial_dimensions, batch_dim, feature_dim, spatial_dims)); @@ -916,7 +954,8 @@ class ConvertBiasAddOp : public OpRewritePattern { auto feature_dim = GetFeatureDimension(data_format, value_type); auto bias_broadcast = Broadcast1DToFeatureDim( loc, op.getValue(), op.getBias(), feature_dim, rewriter); - Value add = rewriter.create(loc, 
op.getValue(), bias_broadcast); + Value add = + rewriter.create(loc, op.getValue(), bias_broadcast); if (add.getType() != op.getType()) { add = rewriter.create(loc, op.getType(), add); } @@ -925,7 +964,7 @@ class ConvertBiasAddOp : public OpRewritePattern { } }; -// Conterts tf.Conv2D to mhlo.dynamic_conv. +// Conterts tf.Conv2D to stablehlo.dynamic_conv. // TODO(disc): To recover static special case's performance with adding folding, // canonicalization func and removing ConvertConvOp. template @@ -1082,10 +1121,10 @@ class ConvertConvDynamic : public OpRewritePattern { paddings.push_back(pad_high); } auto rhs_dilations_attr = rewriter.getNamedAttr( - "rhs_dilation", GetI64ElementsAttr(rhs_dilations, &rewriter)); + "rhs_dilation", GetI64ArrayAttr(rhs_dilations, &rewriter)); auto window_strides_attr = rewriter.getNamedAttr( - "window_strides", GetI64ElementsAttr(window_strides, &rewriter)); + "window_strides", GetI64ArrayAttr(window_strides, &rewriter)); auto dimension_numbers_attr = GetConvDimensionNumbersAttr( spatial_dim_indices, data_format, &rewriter); @@ -1127,7 +1166,7 @@ class ConvertConvDynamic : public OpRewritePattern { new_shape.push_back(1); new_shape.push_back(filter_shape[num_spatial_dims] * filter_shape[num_spatial_dims + 1]); - operands[1] = rewriter.create( + operands[1] = rewriter.create( op.getLoc(), tensorflow::GetTypeFromTFTensorShape(new_shape, filter_ty.getElementType()), @@ -1136,8 +1175,8 @@ class ConvertConvDynamic : public OpRewritePattern { NamedAttribute attrs[] = {rhs_dilations_attr, window_strides_attr, dimension_numbers_attr, feature_group_count_attr, batch_group_count_attr, precision_config_attr}; - rewriter.replaceOpWithNewOp(op, op.getType(), operands, - llvm::ArrayRef(attrs)); + rewriter.replaceOpWithNewOp( + op, op.getType(), operands, llvm::ArrayRef(attrs)); return success(); } @@ -1155,7 +1194,7 @@ using ConvertConv2DDynamic = // // Sample result for Conv2D: // -// %conv = "mhlo.convolution"(%input, %filter) { +// %conv = "stablehlo.convolution"(%input, %filter) { // strides = [1, 2], // paddings = [[1, 0], [1, 1]], // ... @@ -1241,10 +1280,10 @@ class ConvertConvOp : public OpRewritePattern { } auto rhs_dilations_attr = rewriter.getNamedAttr( - "rhs_dilation", GetI64ElementsAttr(rhs_dilations, &rewriter)); + "rhs_dilation", GetI64ArrayAttr(rhs_dilations, &rewriter)); auto window_strides_attr = rewriter.getNamedAttr( - "window_strides", GetI64ElementsAttr(window_strides, &rewriter)); + "window_strides", GetI64ArrayAttr(window_strides, &rewriter)); auto dimension_numbers_attr = GetConvDimensionNumbersAttr( spatial_dim_indices, data_format, &rewriter); @@ -1285,7 +1324,7 @@ class ConvertConvOp : public OpRewritePattern { new_shape.push_back(1); new_shape.push_back(filter_shape[num_spatial_dims] * filter_shape[num_spatial_dims + 1]); - operands[1] = rewriter.create( + operands[1] = rewriter.create( op.getLoc(), tensorflow::GetTypeFromTFTensorShape(new_shape, filter_ty.getElementType()), @@ -1295,8 +1334,8 @@ class ConvertConvOp : public OpRewritePattern { dimension_numbers_attr, feature_group_count_attr, batch_group_count_attr, paddings_attr, precision_config_attr}; - rewriter.replaceOpWithNewOp(op, op.getType(), operands, - llvm::ArrayRef(attrs)); + rewriter.replaceOpWithNewOp( + op, op.getType(), operands, llvm::ArrayRef(attrs)); return success(); } }; @@ -1307,7 +1346,7 @@ using ConvertDepthConv2DOp = ConvertConvOp; -// Converts tf.PadV2Op to mhlo.DynamicPadOp. Padding values must be const. +// Converts tf.PadV2Op to stablehlo.DynamicPadOp. 
Padding values must be const. class ConvertPadOpDynamic : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; @@ -1334,38 +1373,38 @@ class ConvertPadOpDynamic : public OpRewritePattern { auto interior_attr = GetI64ElementsAttr(interior_values, &rewriter); Value interior_padding_tensor = - rewriter.create(loc, interior_attr); + rewriter.create(loc, interior_attr); Type paddings_elem_ty = paddings_type.getElementType(); if (!paddings_elem_ty.isInteger(64)) { - interior_padding_tensor = rewriter.create( + interior_padding_tensor = rewriter.create( loc, interior_padding_tensor, paddings_elem_ty); } llvm::SmallVector transposed_shape = {2, input_rank}; - auto transpose_attr = GetI64ElementsAttr({1, 0}, &rewriter); + auto transpose_attr = GetI64ArrayAttr({1, 0}, &rewriter); Value transposed_paddings = - rewriter.create(loc, paddings, transpose_attr); - Value reshaped_paddings = rewriter.create( + rewriter.create(loc, paddings, transpose_attr); + Value reshaped_paddings = rewriter.create( loc, tensorflow::GetTypeFromTFTensorShape({input_rank * 2}, paddings_elem_ty), transposed_paddings); - auto left_padding_start_attr = GetI64ElementsAttr({0}, &rewriter); - auto left_padding_limit_attr = GetI64ElementsAttr({input_rank}, &rewriter); - auto left_padding_stride_attr = GetI64ElementsAttr({1}, &rewriter); - Value left_padding_tensor = rewriter.create( + auto left_padding_start_attr = GetI64ArrayAttr({0}, &rewriter); + auto left_padding_limit_attr = GetI64ArrayAttr({input_rank}, &rewriter); + auto left_padding_stride_attr = GetI64ArrayAttr({1}, &rewriter); + Value left_padding_tensor = rewriter.create( loc, reshaped_paddings, left_padding_start_attr, left_padding_limit_attr, left_padding_stride_attr); - auto right_padding_start_attr = GetI64ElementsAttr({input_rank}, &rewriter); + auto right_padding_start_attr = GetI64ArrayAttr({input_rank}, &rewriter); auto right_padding_limit_attr = - GetI64ElementsAttr({2 * input_rank}, &rewriter); - auto right_padding_stride_attr = GetI64ElementsAttr({1}, &rewriter); - Value right_padding_tensor = rewriter.create( + GetI64ArrayAttr({2 * input_rank}, &rewriter); + auto right_padding_stride_attr = GetI64ArrayAttr({1}, &rewriter); + Value right_padding_tensor = rewriter.create( loc, reshaped_paddings, right_padding_start_attr, right_padding_limit_attr, right_padding_stride_attr); - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, op.getType(), input, constant_values, left_padding_tensor, right_padding_tensor, interior_padding_tensor); @@ -1375,11 +1414,11 @@ class ConvertPadOpDynamic : public OpRewritePattern { class ConvertGatherNdOpDynamic : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; - // Converts tf.GatherNdOp to mhlo.DynamicGatherOp. + // Converts tf.GatherNdOp to stablehlo.DynamicGatherOp. // Here we leave 'slice_sizes' as an Attr, without defining a new // DynamicGatherOp, since GatherDimensionNumbers has already provide enough - // information for shape inference and code generation of mhlo::GatherOp. '?' - // will be filled into slice_sizes for dimensions that are dynamic sized. + // information for shape inference and code generation of stablehlo::GatherOp. + // '?' will be filled into slice_sizes for dimensions that are dynamic sized. // TODO(disc): To recover static special case's performance with folding and // canonicalization. 
LogicalResult matchAndRewrite(TF::GatherNdOp op, @@ -1450,18 +1489,18 @@ class ConvertGatherNdOpDynamic : public OpRewritePattern { // index_vector_dim int64_t index_vector_dim = indices_rank - 1; - auto dims_attr = GatherDimensionNumbersAttr::get( + auto dims_attr = stablehlo::GatherDimensionNumbersAttr::get( rewriter.getContext(), offset_dims, collapsed_slice_dims, /*operandBatchingDims=*/{}, /*startIndicesBatchingDims=*/{}, start_index_map, index_vector_dim); // TODO(disc): Remove this if-statement once fold and canonicalization is // implemented. if (params_ty.hasStaticShape() && indices_ty.hasStaticShape()) { - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, op.getType(), op.getParams(), op.getIndices(), dims_attr, - GetI64ElementsAttr(slice_sizes, &rewriter)); + GetI64ArrayAttr(slice_sizes, &rewriter)); } else { - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, op.getType(), op.getParams(), op.getIndices(), slice_sizes_value, dims_attr); } @@ -1496,16 +1535,18 @@ class ConvertBF16FloorDivOp : public OpRewritePattern { auto out_type = op.getZ().getType(); - l = rewriter.create(op.getLoc(), l, rewriter.getF32Type()); - r = rewriter.create(op.getLoc(), r, rewriter.getF32Type()); + l = rewriter.create(op.getLoc(), l, + rewriter.getF32Type()); + r = rewriter.create(op.getLoc(), r, + rewriter.getF32Type()); auto intermediate = rewriter.create( op.getLoc(), ChangeTensorElementType(&rewriter, out_type, rewriter.getF32Type()), l, r); - auto floor_op = - rewriter.create(op.getLoc(), out_type, intermediate); + auto floor_op = rewriter.create(op.getLoc(), out_type, + intermediate); rewriter.replaceOp(op, floor_op.getResult()); return success(); } @@ -1534,9 +1575,9 @@ class ConvertBroadcastToOp : public OpRewritePattern { broadcast_dimensions = llvm::to_vector<4>( llvm::seq(rank_diff, ranked_output_type.getRank())); } - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, output_type, op.getInput(), op.getShape(), - rewriter.getI64TensorAttr(broadcast_dimensions)); + GetI64ArrayAttr(broadcast_dimensions, &rewriter)); return success(); } }; @@ -1574,25 +1615,27 @@ class ConvertRollOp : public OpRewritePattern { // offset = ((offset % axis_size) + axis_size) % axis_size ImplicitLocOpBuilder b(op.getLoc(), rewriter); Value offset = op.getShift(); - auto axis_size = b.create(b.getIntegerAttr( + auto axis_size = b.create(b.getIntegerAttr( getElementTypeOrSelf(offset.getType()), input_shape[axis])); - offset = b.create( - b.create(b.create(offset, axis_size), axis_size), + offset = b.create( + b.create( + b.create(offset, axis_size), axis_size), axis_size); // Stack two copies of the dimension, then slice from the calculated // offset. This also works if shift is not constant. // DynamicSliceOp requires the sizes being integer, and we can get the // information from input shape. 
- auto concat = b.create( + auto concat = b.create( ValueRange{op.getInput(), op.getInput()}, b.getI64IntegerAttr(axis)); - Value zero = b.create( + Value zero = b.create( b.getIntegerAttr(getElementTypeOrSelf(offset.getType()), 0)); SmallVector slice_begin_indices(input_rank, zero); - slice_begin_indices[axis] = b.create(axis_size, offset); - rewriter.replaceOpWithNewOp( + slice_begin_indices[axis] = + b.create(axis_size, offset); + rewriter.replaceOpWithNewOp( op, input_ty, concat, slice_begin_indices, - rewriter.getI64TensorAttr(input_shape)); + GetI64ArrayAttr(input_shape, &rewriter)); return success(); } }; @@ -1613,13 +1656,13 @@ class ConvertLeakyReluOp : public OpRewritePattern { Value zeroVal = chlo::getConstantLike(rewriter, loc, 0.0, features); Value leakyActivationVal = - rewriter.create(loc, features, alphaVal); + rewriter.create(loc, features, alphaVal); - Value compareGtZero = rewriter.create( - loc, features, zeroVal, ComparisonDirection::GT); + Value compareGtZero = rewriter.create( + loc, features, zeroVal, stablehlo::ComparisonDirection::GT); - rewriter.replaceOpWithNewOp(op, compareGtZero, features, - leakyActivationVal); + rewriter.replaceOpWithNewOp( + op, compareGtZero, features, leakyActivationVal); return success(); } }; @@ -1643,29 +1686,29 @@ class ConvertLeakyReluGradOp : public OpRewritePattern { Value zeroVal = chlo::getConstantLike(rewriter, loc, 0.0, features); Value leakyGradientVal = - rewriter.create(loc, gradients, alphaVal); + rewriter.create(loc, gradients, alphaVal); - Value compareGtZero = rewriter.create( - loc, features, zeroVal, ComparisonDirection::GT); + Value compareGtZero = rewriter.create( + loc, features, zeroVal, stablehlo::ComparisonDirection::GT); - rewriter.replaceOpWithNewOp(op, featureType, compareGtZero, - gradients, leakyGradientVal); + rewriter.replaceOpWithNewOp( + op, featureType, compareGtZero, gradients, leakyGradientVal); return success(); } }; // Converts TensorFlow DiagPartOp to HLO ops using reduction on masked matrix. 
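The shift normalization and concat-then-slice trick used by this Roll lowering can be checked in isolation. Below is a plain C++ model for a single 1-D axis that mirrors the ((offset % n) + n) % n and n - offset arithmetic from the hunk, with everything else simplified away; it is a sketch, not the actual lowering.

// 1-D model of the Roll lowering: normalize the shift to [0, n), stack two
// copies of the axis, then slice n elements starting at n - offset.
#include <cstdint>
#include <iostream>
#include <vector>

static std::vector<int32_t> Roll1D(const std::vector<int32_t>& input,
                                   int64_t shift) {
  const int64_t n = static_cast<int64_t>(input.size());
  // Normalize the shift so it is in [0, n), even for negative shifts.
  const int64_t offset = ((shift % n) + n) % n;
  // "Stack two copies of the dimension, then slice from the calculated
  // offset": element i of the result is doubled[n - offset + i].
  std::vector<int32_t> doubled(input);
  doubled.insert(doubled.end(), input.begin(), input.end());
  return std::vector<int32_t>(doubled.begin() + (n - offset),
                              doubled.begin() + (n - offset) + n);
}

int main() {
  // A shift of -7 on length 5 behaves like a shift of +3.
  for (int32_t v : Roll1D({1, 2, 3, 4, 5}, -7)) std::cout << v << " ";
  std::cout << "\n";  // prints: 3 4 5 1 2
}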
// For a Rank-2 input, it creates the following ops: -// %1 = "mhlo.iota"() {iota_dimension = 0 : i64} -// %2 = "mhlo.iota"() {iota_dimension = 1 : i64} -// %3 = "mhlo.compare"(%1, %2) {comparison_direction = "EQ"} -// %4 = mhlo.constant dense<0.000000e+00> : tensor -// %5 = "mhlo.broadcast"(%4) -// %6 = "mhlo.select"(%3, %input, %5) -// %7 = "mhlo.reduce"(%6, %4) ({ +// %1 = "stablehlo.iota"() {iota_dimension = 0 : i64} +// %2 = "stablehlo.iota"() {iota_dimension = 1 : i64} +// %3 = "stablehlo.compare"(%1, %2) {comparison_direction = "EQ"} +// %4 = stablehlo.constant dense<0.000000e+00> : tensor +// %5 = "stablehlo.broadcast"(%4) +// %6 = "stablehlo.select"(%3, %input, %5) +// %7 = "stablehlo.reduce"(%6, %4) ({ // ^bb0(%arg1: tensor, %arg2: tensor): -// %9 = mhlo.add %arg1, %arg2 : tensor -// "mhlo.return"(%9) : (tensor) -> () +// %9 = stablehlo.add %arg1, %arg2 : tensor +// "stablehlo.return"(%9) : (tensor) -> () // }) {dimensions = dense<0> : tensor<1xi64>} // // If the input's rank N is greater than 2, we will reshape it to R2 first and @@ -1690,35 +1733,35 @@ class ConvertDiagPartOp : public OpRewritePattern { new_size *= input_type.getDimSize(i); new_dims.push_back(input_type.getDimSize(i)); } - Value reshaped_input = rewriter.create( + Value reshaped_input = rewriter.create( op.getLoc(), tensorflow::GetTypeFromTFTensorShape({new_size, new_size}, input_type.getElementType()), op.getInput()); auto iota_type = tensorflow::GetTypeFromTFTensorShape( {new_size, new_size}, rewriter.getIntegerType(32)); - auto iota0 = rewriter.create(op.getLoc(), iota_type, - rewriter.getI64IntegerAttr(0)); - auto iota1 = rewriter.create(op.getLoc(), iota_type, - rewriter.getI64IntegerAttr(1)); - Value compare = rewriter.create(op.getLoc(), iota0, iota1, - ComparisonDirection::EQ); + auto iota0 = rewriter.create( + op.getLoc(), iota_type, rewriter.getI64IntegerAttr(0)); + auto iota1 = rewriter.create( + op.getLoc(), iota_type, rewriter.getI64IntegerAttr(1)); + Value compare = rewriter.create( + op.getLoc(), iota0, iota1, stablehlo::ComparisonDirection::EQ); Value zero = GetScalarConstOfType(input_type.getElementType(), op.getLoc(), 0, &rewriter); - Value zero_matrix = rewriter.create( + Value zero_matrix = rewriter.create( op.getLoc(), reshaped_input.getType(), zero, - GetI64ElementsAttr({new_size, new_size}, &rewriter)); - Value masked = - rewriter.create(op.getLoc(), reshaped_input.getType(), - compare, reshaped_input, zero_matrix); - auto reduce = rewriter.create(op.getLoc(), masked, zero, - GetI64ElementsAttr({0}, &rewriter), - input_type.getElementType()); + GetI64ArrayAttr({new_size, new_size}, &rewriter)); + Value masked = rewriter.create( + op.getLoc(), reshaped_input.getType(), compare, reshaped_input, + zero_matrix); + auto reduce = rewriter.create( + op.getLoc(), masked, zero, GetI64ArrayAttr({0}, &rewriter), + input_type.getElementType()); assert(!input_type.getElementType().isInteger(1) && "data type should not be i1"); - BuildReduceBody(input_type.getElementType(), &reduce.getBody(), - &rewriter); - rewriter.replaceOpWithNewOp( + BuildReduceBody(input_type.getElementType(), + &reduce.getBody(), &rewriter); + rewriter.replaceOpWithNewOp( op, tensorflow::GetTypeFromTFTensorShape(new_dims, input_type.getElementType()), @@ -1756,15 +1799,16 @@ class ConvertMatrixDiagPartV3Op } // Utility method for broadcasting integer constants to a given shape. 
- BroadcastOp BroadcastConstant(Location loc, Shape shape, int32_t constant, - int int_size, PatternRewriter &rewriter) const { - return rewriter.create( + stablehlo::BroadcastOp BroadcastConstant(Location loc, Shape shape, + int32_t constant, int int_size, + PatternRewriter &rewriter) const { + return rewriter.create( loc, tensorflow::GetTypeFromTFTensorShape(shape, rewriter.getIntegerType(int_size)), GetScalarConstOfType(rewriter.getIntegerType(int_size), loc, constant, &rewriter), - GetI64ElementsAttr(shape, &rewriter)); + GetI64ArrayAttr(shape, &rewriter)); } public: @@ -1834,10 +1878,10 @@ class ConvertMatrixDiagPartV3Op RankedTensorType iota_type = tensorflow::GetTypeFromTFTensorShape( indices_shape, rewriter.getIntegerType(32)); - Value iotaM = - rewriter.create(loc, iota_type, rewriter.getI64IntegerAttr(1)); - Value iotaN = - rewriter.create(loc, iota_type, rewriter.getI64IntegerAttr(2)); + Value iotaM = rewriter.create( + loc, iota_type, rewriter.getI64IntegerAttr(1)); + Value iotaN = rewriter.create( + loc, iota_type, rewriter.getI64IntegerAttr(2)); // Boradcasted constants, of the same shape as iotaM and iotaN. Value b_zero = BroadcastConstant(loc, indices_shape, 0, 32, rewriter); @@ -1854,17 +1898,17 @@ class ConvertMatrixDiagPartV3Op // subtract m here. This means we start with the superdiagonals and // move downwards towards the subdiagonals. So the start indices will // be decreasing.) - Value d = rewriter.create(loc, b_k1, iotaM); - Value neg_d = rewriter.create(loc, d); + Value d = rewriter.create(loc, b_k1, iotaM); + Value neg_d = rewriter.create(loc, d); // diag_len_d = min(rows + min(d, 0), cols - max(d, 0)) // (Length of a diagonal for a given d. Same as max_diag_len for m = 0.) - Value diag_len_d = rewriter.create( + Value diag_len_d = rewriter.create( loc, - rewriter.create(loc, b_rows, - rewriter.create(loc, d, b_zero)), - rewriter.create(loc, b_cols, - rewriter.create(loc, d, b_zero))); + rewriter.create( + loc, b_rows, rewriter.create(loc, d, b_zero)), + rewriter.create( + loc, b_cols, rewriter.create(loc, d, b_zero))); // offset is max_diag_len - diag_len_d if we're padding, 0 otherwise. Value cmp; @@ -1883,43 +1927,44 @@ class ConvertMatrixDiagPartV3Op // This offset shifts the diagonals to the "left" or "right", depending // on alignment. - Value offset = rewriter.create( + Value offset = rewriter.create( loc, b_zero.getType(), cmp, - rewriter.create(loc, b_max_diag_len, diag_len_d), b_zero); + rewriter.create(loc, b_max_diag_len, diag_len_d), + b_zero); // x = max(d, 0) - offset // y = max(-d, 0) - offset - Value x = rewriter.create( - loc, rewriter.create(loc, d, b_zero), offset); - Value y = rewriter.create( - loc, rewriter.create(loc, neg_d, b_zero), offset); + Value x = rewriter.create( + loc, rewriter.create(loc, d, b_zero), offset); + Value y = rewriter.create( + loc, rewriter.create(loc, neg_d, b_zero), offset); - Value n_plus_x = rewriter.create(loc, iotaN, x); - Value n_plus_y = rewriter.create(loc, iotaN, y); + Value n_plus_x = rewriter.create(loc, iotaN, x); + Value n_plus_y = rewriter.create(loc, iotaN, y); // GatherOp is happy about letting us index out of bounds values, but those // values will be undefined. So we mask them later. Set up the boolean // expression that tells us which entries, in the output shape, are out of // bounds and thus become the padding_value. 
- Value x_in_bounds = rewriter.create( + Value x_in_bounds = rewriter.create( loc, rewriter.create(loc, b_false.getType(), n_plus_x, b_zero), rewriter.create(loc, b_false.getType(), n_plus_x, b_cols)); - Value y_in_bounds = rewriter.create( + Value y_in_bounds = rewriter.create( loc, rewriter.create(loc, b_false.getType(), n_plus_y, b_zero), rewriter.create(loc, b_false.getType(), n_plus_y, b_rows)); - Value in_bounds = rewriter.create( + Value in_bounds = rewriter.create( loc, tensorflow::GetTypeFromTFTensorShape(Shape({num_diags, max_diag_len}), rewriter.getIntegerType(1)), - rewriter.create(loc, x_in_bounds, y_in_bounds)); + rewriter.create(loc, x_in_bounds, y_in_bounds)); // Now combine x and y into the index data structure needed for gather. Shape concat_shape({2, num_diags, max_diag_len}); - Value start_indices = rewriter.create( + Value start_indices = rewriter.create( loc, tensorflow::GetTypeFromTFTensorShape(concat_shape, rewriter.getIntegerType(32)), @@ -1957,16 +2002,16 @@ class ConvertMatrixDiagPartV3Op // Gather the diagonal entries. // TODO(kramm): For a single diagonal, this might be slower than the // mask + sum approach. Special-case num_diags==1? - auto dims_attr = GatherDimensionNumbersAttr::get( + auto dims_attr = stablehlo::GatherDimensionNumbersAttr::get( rewriter.getContext(), /*offsetDims=*/llvm::to_vector<4>(llvm::seq(0, num_dims - 2)), /*collapsedSliceDims=*/collapsed_dims, /*operandBatchingDims=*/{}, /*startIndicesBatchingDims=*/{}, start_index_map, /*indexVectorDim=*/0); - Value gather = rewriter.create( + Value gather = rewriter.create( loc, op.getInput(), start_indices, dims_attr, - GetI64ElementsAttr(slice_sizes, &rewriter)); + GetI64ArrayAttr(slice_sizes, &rewriter)); // We now need to broadcast the "in_bounds" boolean expression, as well as // the padding value, to do the final select. @@ -1974,22 +2019,22 @@ class ConvertMatrixDiagPartV3Op for (int i = 0; i < output_shape.size() - 2; i++) { broadcast_bounds.push_back(output_shape[i]); } - Value b_in_bounds = rewriter.create( + Value b_in_bounds = rewriter.create( loc, tensorflow::GetTypeFromTFTensorShape(output_shape, rewriter.getIntegerType(1)), - in_bounds, GetI64ElementsAttr(broadcast_bounds, &rewriter)); - Value b_padding = rewriter.create( - loc, op.getPaddingValue(), GetI64ElementsAttr(output_shape, &rewriter)); + in_bounds, GetI64ArrayAttr(broadcast_bounds, &rewriter)); + Value b_padding = rewriter.create( + loc, op.getPaddingValue(), GetI64ArrayAttr(output_shape, &rewriter)); // Replace all out-of-bounds values in the result with padding_value. - Value result = - rewriter.create(loc, b_in_bounds, gather, b_padding); + Value result = rewriter.create(loc, b_in_bounds, + gather, b_padding); if (num_diags == 1) { // matrix_diag_part folds away the 1-sized band dimension if we only // extract a single diagonal. - result = rewriter.create(loc, op.getType(), result); + result = rewriter.create(loc, op.getType(), result); } rewriter.replaceOp(op, result); @@ -2012,7 +2057,7 @@ class ConvertEinsumOp : public OpRewritePattern { // creates a scalar constant 1.0 for first operand. 
if (op.getN() == 1) { equation_str = "," + equation_str; - inputs.push_back(rewriter.create( + inputs.push_back(rewriter.create( op.getLoc(), hlo::getScalarOfType( mlir::getElementTypeOrSelf(op.getOperand(0)), 1))); } @@ -2022,8 +2067,8 @@ class ConvertEinsumOp : public OpRewritePattern { inputs.insert(inputs.end(), operands.begin(), operands.end()); assert(inputs.size() == 2); - rewriter.replaceOpWithNewOp(op, op.getType(), inputs[0], - inputs[1], equation_str); + rewriter.replaceOpWithNewOp( + op, op.getType(), inputs[0], inputs[1], equation_str); return success(); } }; @@ -2084,13 +2129,13 @@ class ConvertFFTOp : public OpRewritePattern { // Last dim larger than expected_dim, slice the input if (input_shape.back() > expected_dim) { - reshaped = rewriter.create( + reshaped = rewriter.create( op.getLoc(), tensorflow::GetTypeFromTFTensorShape(expected_shape, input_ty.getElementType()), - op.getInput(), GetI64ElementsAttr(begin_indices, &rewriter), - GetI64ElementsAttr(expected_shape, &rewriter), - GetI64ElementsAttr(strides, &rewriter)); + op.getInput(), GetI64ArrayAttr(begin_indices, &rewriter), + GetI64ArrayAttr(expected_shape, &rewriter), + GetI64ArrayAttr(strides, &rewriter)); // Last dim smaller than expected_dim, zero-pad the input } else if (input_ty.getShape().back() < expected_dim) { @@ -2099,20 +2144,21 @@ class ConvertFFTOp : public OpRewritePattern { padding.push_back(expected_dim - input_shape.back()); Value zero = GetScalarConstOfType(input_ty.getElementType(), loc, 0, &rewriter); - reshaped = rewriter.create( + reshaped = rewriter.create( loc, tensorflow::GetTypeFromTFTensorShape(expected_shape, input_ty.getElementType()), - op.getInput(), zero, GetI64ElementsAttr(no_padding, &rewriter), - GetI64ElementsAttr(padding, &rewriter), - GetI64ElementsAttr(no_padding, &rewriter)); + op.getInput(), zero, GetI64ArrayAttr(no_padding, &rewriter), + GetI64ArrayAttr(padding, &rewriter), + GetI64ArrayAttr(no_padding, &rewriter)); } - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, op.getType(), reshaped, - FftTypeAttr::get(rewriter.getContext(), - symbolizeFftType(fft_string).value()), - rewriter.getI64TensorAttr(fft_length)); + stablehlo::FftTypeAttr::get( + rewriter.getContext(), + stablehlo::symbolizeFftType(fft_string).value()), + GetI64ArrayAttr(fft_length, &rewriter)); return success(); } }; @@ -2147,8 +2193,8 @@ class ConvertFusedBatchNormGradBase // To support mixed precision, the statistics type, which maybe more // precise than the input types, are used for this op. 
Type kernel_type = mlir::cast(scale.getType()).getElementType(); - grad = rewriter.create(loc, grad, kernel_type); - act = rewriter.create(loc, act, kernel_type); + grad = rewriter.create(loc, grad, kernel_type); + act = rewriter.create(loc, act, kernel_type); tensorflow::TensorFormat data_format; if (!FormatFromString(op.getDataFormat().str(), &data_format)) @@ -2167,7 +2213,7 @@ class ConvertFusedBatchNormGradBase SmallVector operand_types = {act.getType(), feature_type, feature_type}; - auto training_op = rewriter.create( + auto training_op = rewriter.create( loc, operand_types, act, scale, mean, var, grad, op.getEpsilon(), feature_dim); @@ -2188,43 +2234,45 @@ class ConvertFusedBatchNormGradBase // scratch1 = rsqrt(var + epsilon) RankedTensorType scalar_float = tensorflow::GetTypeFromTFTensorShape({}, kernel_type); - auto epsilon = rewriter.create( + auto epsilon = rewriter.create( loc, DenseFPElementsAttr::get(scalar_float, {op.getEpsilon()})); auto add_op = rewriter.create( loc, var, epsilon.getResult(), scalar_broadcast_dims); - Value scratch1 = rewriter.create(loc, add_op); + Value scratch1 = rewriter.create(loc, add_op); // scratch2 = sum(y_backprop * (x - mean)) - auto sub_op = rewriter.create( + auto sub_op = rewriter.create( loc, act, Broadcast1DToFeatureDim(loc, act, mean, feature_dim, rewriter)); - auto weighted_grad = rewriter.create(loc, grad, sub_op); + auto weighted_grad = rewriter.create(loc, grad, sub_op); Value scratch2 = ApplyReduction(loc, weighted_grad, reduce_dims, &rewriter); // x_backprop = y_backprop * (scale * scratch1) auto scaled_grad = - rewriter.create(loc, op.getScale(), scratch1); - x_backprop = rewriter.create( + rewriter.create(loc, op.getScale(), scratch1); + x_backprop = rewriter.create( loc, grad, Broadcast1DToFeatureDim(loc, act, scaled_grad, feature_dim, rewriter)); // scale_backprop = scratch2 * scratch1 - scale_backprop = rewriter.create(loc, scratch1, scratch2); + scale_backprop = + rewriter.create(loc, scratch1, scratch2); // offset_backprop = sum(y_backprop) offset_backprop = ApplyReduction(loc, grad, reduce_dims, &rewriter); } - x_backprop = rewriter.create(loc, x_backprop, act_ele_type); + x_backprop = + rewriter.create(loc, x_backprop, act_ele_type); Value last_val[2]; if (op.getResult(3).use_empty() && op.getResult(4).use_empty()) { // It doesn't matter what values we provide for the last 2 results. last_val[0] = last_val[1] = op.getX(); } else { - auto const_val = rewriter.create( + auto const_val = rewriter.create( op.getLoc(), DenseElementsAttr::get( tensorflow::GetTypeFromTFTensorShape( {0}, getElementTypeOrSelf(op.getResult(3))), @@ -2285,7 +2333,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { // TODO(b/69928690): Support mixed precision in the XLA batch // normalization operators. As a workaround, create a new x with the same // element type as scale (which may be more precise than the input type). - Value bn_train_input = rewriter.create( + Value bn_train_input = rewriter.create( op.getLoc(), op.getX(), scale_element_type); TensorType bn_train_input_type_tensor = mlir::cast(bn_train_input.getType()); @@ -2303,7 +2351,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { // batch_mean, and batch_var. 
SmallVector operand_types = {bn_train_input_type_tensor, mean_var_type, mean_var_type}; - auto bn_train_op = rewriter.create( + auto bn_train_op = rewriter.create( op.getLoc(), operand_types, bn_train_input, op.getScale(), op.getOffset(), op.getEpsilon(), feature_dim.getInt()); // HLO op outputs a tuple of tensors. Extract those results. @@ -2320,7 +2368,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { int sample_size_minus_one = std::max(1, sample_size - 1); double factor = static_cast(sample_size) / static_cast(sample_size_minus_one); - auto factor_const_op = rewriter.create( + auto factor_const_op = rewriter.create( op.getLoc(), rewriter.getFloatAttr(scale_element_type, factor)); Value corrected_variance = rewriter.create( @@ -2329,16 +2377,16 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { // Convert back to input type to stay aligned with expected output type // for TF op. - y_out = rewriter.create(op.getLoc(), y_out, - input_element_type); + y_out = rewriter.create(op.getLoc(), y_out, + input_element_type); float exponential_avg_factor = op.getExponentialAvgFactor().convertToFloat(); if (exponential_avg_factor != 1.0f) { - auto alpha = rewriter.create( + auto alpha = rewriter.create( op.getLoc(), rewriter.getFloatAttr(mean_element_type, 1.0f - exponential_avg_factor)); - auto beta = rewriter.create( + auto beta = rewriter.create( op.getLoc(), rewriter.getFloatAttr(mean_element_type, exponential_avg_factor)); @@ -2385,7 +2433,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { : 0; auto const_attr_type = tensorflow::GetTypeFromTFTensorShape( {num_elements}, getElementTypeOrSelf(reserve_space_3_type)); - Value dummy_const = rewriter.create( + Value dummy_const = rewriter.create( op.getLoc(), DenseElementsAttr::get(const_attr_type, 0.0)); if (const_attr_type != reserve_space_3_type) dummy_const = rewriter.create( @@ -2397,7 +2445,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { /*reserve_space_3=*/dummy_const}); } } else { // Inference case. - auto bn_train_op = rewriter.create( + auto bn_train_op = rewriter.create( op.getLoc(), /*result_type=*/bn_train_input_type_tensor, bn_train_input, op.getScale(), op.getOffset(), op.getMean(), op.getVariance(), @@ -2405,8 +2453,8 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { // Convert back to input type to stay aligned with expected output type // for TF op. - auto y_out = rewriter.create(op.getLoc(), bn_train_op, - input_element_type); + auto y_out = rewriter.create( + op.getLoc(), bn_train_op, input_element_type); // The mean, variance, and reserved space outputs of the batch norm op are // not used for inference. It doesn't matter what values we provide for @@ -2429,7 +2477,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { : 0; auto const_attr_type = tensorflow::GetTypeFromTFTensorShape( {num_elements}, getElementTypeOrSelf(reserve_space_3_type)); - Value dummy_const = rewriter.create( + Value dummy_const = rewriter.create( op.getLoc(), DenseElementsAttr::get(const_attr_type, 0.0)); if (const_attr_type != reserve_space_3_type) dummy_const = rewriter.create( @@ -2541,7 +2589,7 @@ Operation *AvgPoolDivideByCount( // Build all-ones tensor of same shape as the original input. ElementsAttr splat = hlo::getSplat(&rewriter, orig_input_type, 1); - auto all_ones_tensor = rewriter.create(loc, splat); + auto all_ones_tensor = rewriter.create(loc, splat); // Get padding for the input. 
DenseIntElementsAttr input_padding_attr = @@ -2551,20 +2599,23 @@ Operation *AvgPoolDivideByCount( // Count the 1's in each window, using the same padding as for the input, // which gives us the window counts by which `pooled` needs to be divided. - auto divisor = rewriter.create( + auto divisor = rewriter.create( loc, pooled_type, /*operand=*/all_ones_tensor, /*init_value=*/zero, - /*window_dimensions=*/GetI64ElementsAttr(op.getKsize()), - /*window_strides=*/GetI64ElementsAttr(op.getStrides()), - /*base_dilations=*/DenseIntElementsAttr(), - /*window_dilations=*/DenseIntElementsAttr(), + /*window_dimensions=*/ + ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getKsize()), &rewriter), + /*window_strides=*/ + ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getStrides()), &rewriter), + /*base_dilations=*/DenseI64ArrayAttr(), + /*window_dilations=*/DenseI64ArrayAttr(), /*padding=*/input_padding_attr); - BuildReduceBody(element_type, &divisor.getBody(), &rewriter); + BuildReduceBody(element_type, &divisor.getBody(), + &rewriter); // Divide `pooled` by window counts. - result = rewriter.create(loc, pooled_type, pooled, - divisor.getResult(0)); + result = rewriter.create(loc, pooled_type, pooled, + divisor.getResult(0)); } return result; } @@ -2600,8 +2651,8 @@ class ConvertAvgPoolOp : public OpRewritePattern { // Convert if we need enlarge the element type's bitwidth. if (input_element_type != sum_element_type) - input_value = rewriter.create(op.getLoc(), input_value, - sum_element_type); + input_value = rewriter.create( + op.getLoc(), input_value, sum_element_type); // Create the ReduceWindow op. Value init = @@ -2609,12 +2660,14 @@ class ConvertAvgPoolOp : public OpRewritePattern { DenseIntElementsAttr paddings_attr = GetReduceWindowPaddingAsAttr( input_type.getShape(), op.getKsize(), op.getStrides(), op.getPadding(), &rewriter); - auto reduce = rewriter.create( + auto reduce = rewriter.create( op.getLoc(), result_type, input_value, init, - GetI64ElementsAttr(op.getKsize()), GetI64ElementsAttr(op.getStrides()), - /*base_dilations=*/DenseIntElementsAttr(), - /*window_dilations=*/DenseIntElementsAttr(), paddings_attr); - BuildReduceBody(sum_element_type, &reduce.getBody(), &rewriter); + ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getKsize()), &rewriter), + ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getStrides()), &rewriter), + /*base_dilations=*/DenseI64ArrayAttr(), + /*window_dilations=*/DenseI64ArrayAttr(), paddings_attr); + BuildReduceBody(sum_element_type, &reduce.getBody(), + &rewriter); // Count the number of elements in the window. The following calculation // is only valid for no paddings. @@ -2630,8 +2683,8 @@ class ConvertAvgPoolOp : public OpRewritePattern { // Convert back if we enlarged the element type's bitwidth. 
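// Note on ToDenseI64ArrayAttr as used for the ReduceWindow rewrites above:
// the window dimensions and strides still originate from the op's ksize and
// strides attributes via GetI64ElementsAttr, and are then re-packaged into
// the DenseI64ArrayAttr form expected by stablehlo::ReduceWindowOp. A minimal
// sketch of such a conversion helper, assuming it is a local utility whose
// definition is outside this hunk:
static DenseI64ArrayAttr ToDenseI64ArrayAttr(DenseIntElementsAttr elements,
                                             Builder *builder) {
  // Copy the i64 values out of the elements attribute and rebuild them as a
  // DenseI64ArrayAttr.
  return builder->getDenseI64ArrayAttr(
      llvm::to_vector(elements.getValues<int64_t>()));
}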
Value result = result_op->getOpResult(0); if (input_element_type != sum_element_type) - result = - rewriter.create(op.getLoc(), result, input_element_type); + result = rewriter.create(op.getLoc(), result, + input_element_type); rewriter.replaceOp(op, result); return success(); @@ -2772,13 +2825,13 @@ class ConvertAvgPoolGradOp : public OpRewritePattern { out_grad_shape[dim] = low_padding[dim] + high_padding[dim] + (out_grad_shape[dim] - 1) * strides[dim] + 1; } - Value reduce_window_input = rewriter.create( + Value reduce_window_input = rewriter.create( loc, tensorflow::GetTypeFromTFTensorShape(out_grad_shape, element_type), /*operand=*/out_grad_divided->getOpResult(0), /*padding_value=*/zero, - /*edge_padding_low=*/GetI64ElementsAttr(low_padding, &rewriter), - /*edge_padding_high=*/GetI64ElementsAttr(high_padding, &rewriter), - /*interior_padding=*/GetI64ElementsAttr(interior_padding, &rewriter)); + /*edge_padding_low=*/GetI64ArrayAttr(low_padding, &rewriter), + /*edge_padding_high=*/GetI64ArrayAttr(high_padding, &rewriter), + /*interior_padding=*/GetI64ArrayAttr(interior_padding, &rewriter)); // Compute result by convolving `reduce_window_input` with an all-ones // kernel, using `ReduceWindowOp` with `AddOp` body. @@ -2786,29 +2839,31 @@ class ConvertAvgPoolGradOp : public OpRewritePattern { Type sum_element_type = GetSumAccumulationType(element_type); if (element_type != sum_element_type) { // Convert to appropriate sum accumulation type to avoid precision loss. - reduce_window_input = rewriter.create(loc, reduce_window_input, - sum_element_type); + reduce_window_input = rewriter.create( + loc, reduce_window_input, sum_element_type); zero = GetScalarConstOfType(sum_element_type, loc, 0, &rewriter); } - auto ones = GetI64ElementsAttr(DimVector(num_dims, 1), &rewriter); - auto reduce_window_op = rewriter.create( + auto ones = GetI64ArrayAttr(DimVector(num_dims, 1), &rewriter); + auto reduce_window_op = rewriter.create( loc, tensorflow::GetTypeFromTFTensorShape(orig_input_shape, sum_element_type), /*operand=*/reduce_window_input, /*init_value=*/zero, - /*window_dimensions=*/GetI64ElementsAttr(op.getKsize()), + /*window_dimensions=*/ + ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getKsize()), &rewriter), /*window_strides=*/ones, - /*base_dilations=*/DenseIntElementsAttr(), - /*window_dilations=*/DenseIntElementsAttr(), + /*base_dilations=*/DenseI64ArrayAttr(), + /*window_dilations=*/DenseI64ArrayAttr(), /*padding=*/DenseIntElementsAttr()); - BuildReduceBody(sum_element_type, &reduce_window_op.getBody(), - &rewriter); + BuildReduceBody(sum_element_type, + &reduce_window_op.getBody(), &rewriter); Value result = reduce_window_op.getResult(0); if (element_type != sum_element_type) { // Convert back to original element type. - result = rewriter.create(op.getLoc(), result, element_type); + result = rewriter.create(op.getLoc(), result, + element_type); } rewriter.replaceOp(op, {result}); return success(); @@ -2826,7 +2881,7 @@ using ConvertAvgPool3DGradOp = // Sample result for VALID padding mode: // // %init = arith.constant dense<...> : tensor -// %max_pool = "mhlo.reduce"(%inp, %init) ["mhlo.maximum"] +// %max_pool = "stablehlo.reduce"(%inp, %init) ["stablehlo.maximum"] // {window_dimensions = ..., window_strides = ... 
} // template @@ -2846,7 +2901,7 @@ class ConvertMaxPoolOp : public OpRewritePattern { return failure(); } Location loc = op.getLoc(); - ConstantOp init = GetScalarLimitConstOfType( + stablehlo::ConstantOp init = GetScalarLimitConstOfType( element_type, loc, hlo::kInfinityLowest, &rewriter); auto input_ty = mlir::dyn_cast(op.getInput().getType()); @@ -2854,12 +2909,14 @@ class ConvertMaxPoolOp : public OpRewritePattern { DenseIntElementsAttr paddings_attr = GetReduceWindowPaddingAsAttr( input_ty.getShape(), op.getKsize(), op.getStrides(), op.getPadding(), &rewriter); - auto reduce = rewriter.create( + auto reduce = rewriter.create( loc, op.getType(), op.getInput(), init, - GetI64ElementsAttr(op.getKsize()), GetI64ElementsAttr(op.getStrides()), - /*base_dilations=*/DenseIntElementsAttr(), - /*window_dilations=*/DenseIntElementsAttr(), paddings_attr); - BuildReduceBody(element_type, &reduce.getBody(), &rewriter); + ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getKsize()), &rewriter), + ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getStrides()), &rewriter), + /*base_dilations=*/DenseI64ArrayAttr(), + /*window_dilations=*/DenseI64ArrayAttr(), paddings_attr); + BuildReduceBody(element_type, &reduce.getBody(), + &rewriter); rewriter.replaceOp(op, reduce.getResult(0)); return success(); @@ -2869,8 +2926,8 @@ class ConvertMaxPoolOp : public OpRewritePattern { using ConvertMaxPool2DOp = ConvertMaxPoolOp; using ConvertMaxPool3DOp = ConvertMaxPoolOp; -// Converts tf.Select (SelectV1) to mhlo.select. It has optional broadcasting on -// the condition only. +// Converts tf.Select (SelectV1) to stablehlo.select. It has optional +// broadcasting on the condition only. class ConvertSelectOp : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; @@ -2931,13 +2988,13 @@ class ConvertSelectOp : public OpRewritePattern { if (needs_broadcast) { Value result_extents = b.create( GetExtentsTensorTypeFor(result_type), then_shape); - cond = b.create( + cond = b.create( tensorflow::GetTypeFromTFTensorShape(result_type.getShape(), b.getI1Type()), cond, result_extents, - GetI64ElementsAttrForSeq(0, cond_type.getRank(), &b)); + GetI64ArrayAttrForSeq(0, cond_type.getRank(), &b)); } - Value select = b.create( + Value select = b.create( result_type, cond, op.getThenValue(), op.getElseValue()); b.create(select); rewriter.replaceOp(op, {assuming_op.getResult(0)}); @@ -2945,7 +3002,7 @@ class ConvertSelectOp : public OpRewritePattern { } }; -// Converts the tf.Slice op into mhlo.real_dynamic_slice +// Converts the tf.Slice op into stablehlo.real_dynamic_slice // TODO(disc): To recover static special case's performance with folding and // canonicalization. class ConvertSliceOpDynamic : public OpRewritePattern { @@ -3025,7 +3082,7 @@ class ConvertSliceOpDynamic : public OpRewritePattern { {static_cast(stride_values.size())}, index_ty), stride_values); - auto d_slice = rewriter.create( + auto d_slice = rewriter.create( loc, op.getOperation()->getResult(0).getType(), input, start_indices, end_indices, stride_indices); rewriter.replaceOp(op, d_slice.getOperation()->getResults()); @@ -3100,8 +3157,8 @@ static void BroadcastBatchMatMulV2Operands(Value lhs, Value rhs, Location loc, class ConvertBatchMatMulV2Op : public OpRewritePattern { public: // TODO(hinsu): Legalize this op to Einsum op. HLO Einsum op needs to be moved - // to CHLO and it is missing legalization to MHLO. 
Once that is done, this - // pattern's benefit can be changed back to one as well as the fallback + // to CHLO and it is missing legalization to StableHLO. Once that is done, + // this pattern's benefit can be changed back to one as well as the fallback // lowering pattern for the op can be removed. // // Set benefit of this pattern to zero to prefer the fallback pattern when @@ -3138,7 +3195,7 @@ class ConvertBatchMatMulV2Op : public OpRewritePattern { llvm::ArrayRef({op.getAdjX() ? rank - 2 : rank - 1})); auto rhs_contracting_dimensions = llvm::to_vector<4>( llvm::ArrayRef({op.getAdjY() ? rank - 1 : rank - 2})); - auto dimension_numbers = DotDimensionNumbersAttr::get( + auto dimension_numbers = stablehlo::DotDimensionNumbersAttr::get( rewriter.getContext(), /*lhs_batching_dimensions=*/batch_dimensions, /*rhs_batching_dimensions=*/batch_dimensions, @@ -3146,10 +3203,10 @@ class ConvertBatchMatMulV2Op : public OpRewritePattern { /*rhs_contracting_dimensions=*/rhs_contracting_dimensions); // TODO(silvasean): Emit shape checks for contracting dimensions. // (The batch dimensions are checked by the broadcasting logic) - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, op.getType(), lhs, rhs, dimension_numbers, /*precision_config=*/GetPrecisionConfig(&rewriter), - /*algorithm=*/DotAlgorithmAttr{}); + /*algorithm=*/stablehlo::DotAlgorithmAttr{}); return success(); } }; @@ -3170,20 +3227,20 @@ class ConvertBatchMatMulV2Op : public OpRewritePattern { // // will be converted into: // -// %0 = "mhlo.slice"(%input) { -// limit_indices = dense<[4, 2]> : tensor<2xi64>, -// start_indices = dense<0> : tensor<2xi64>, -// strides = dense<1> : tensor<2xi64>} : +// %0 = "stablehlo.slice"(%input) { +// limit_indices = array, +// start_indices = array, +// strides = array} : // (tensor<4x6xf32>) -> tensor<4x2xf32> -// %1 = "mhlo.slice"(%input) { -// limit_indices = dense<4> : tensor<2xi64>, -// start_indices = dense<[0, 2]> : tensor<2xi64>, -// strides = dense<1> : tensor<2xi64>} : +// %1 = "stablehlo.slice"(%input) { +// limit_indices = array, +// start_indices = array, +// strides = array} : // (tensor<4x6xf32>) -> tensor<4x2xf32> -// %2 = "mhlo.slice"(%input) { -// limit_indices = dense<[4, 6]> : tensor<2xi64>, -// start_indices = dense<[0, 4]> : tensor<2xi64>, -// strides = dense<1> : tensor<2xi64>} : +// %2 = "stablehlo.slice"(%input) { +// limit_indices = array, +// start_indices = array, +// strides = array} : // (tensor<4x6xf32>) -> tensor<4x2xf32> // TODO(antiagainst): consider lowering into TF ops so the pattern can be more // applicable. @@ -3231,11 +3288,11 @@ class ConvertSplitOp : public OpRewritePattern { for (int i = 0; i < num_splits; ++i) { begin_indices[dim_index] = i * slice_size; end_indices[dim_index] = (i + 1) * slice_size; - slices.push_back( - rewriter.create(op.getLoc(), slice_type, op.getValue(), - GetI64ElementsAttr(begin_indices, &rewriter), - GetI64ElementsAttr(end_indices, &rewriter), - GetI64ElementsAttr(strides, &rewriter))); + slices.push_back(rewriter.create( + op.getLoc(), slice_type, op.getValue(), + GetI64ArrayAttr(begin_indices, &rewriter), + GetI64ArrayAttr(end_indices, &rewriter), + GetI64ArrayAttr(strides, &rewriter))); } rewriter.replaceOp(op, slices); @@ -3243,8 +3300,8 @@ class ConvertSplitOp : public OpRewritePattern { } }; -// Converts the tf.Split op into a series of mhlo.real_dynamic_slice ops the -// dimension to split is a constant. 
+// Converts the tf.Split op into a series of stablehlo.real_dynamic_slice ops +// the dimension to split is a constant. // TODO(disc): To recover static special case's performance with folding and // canonicalization. delete ConvertSplitOp class ConvertSplitOpDynamic : public OpRewritePattern { @@ -3320,7 +3377,7 @@ class ConvertSplitOpDynamic : public OpRewritePattern { tensorflow::GetTypeFromTFTensorShape( {static_cast(strides.size())}, index_ty), strides); - slices.push_back(rewriter.create( + slices.push_back(rewriter.create( loc, op.getOperation()->getResult(i).getType(), input, begin_value, end_value, stride_value)); } @@ -3347,20 +3404,20 @@ class ConvertSplitOpDynamic : public OpRewritePattern { // (tensor<4x1xf32>, tensor<4x2xf32>, tensor<4x3xf32>) // // We will generate slices following slices: -// %0 = "mhlo.slice"(%input) { -// limit_indices = dense<[4, 1]> : tensor<2xi64>, -// start_indices = dense<0> : tensor<2xi64>, -// strides = dense<1> : tensor<2xi64>} : +// %0 = "stablehlo.slice"(%input) { +// limit_indices = array, +// start_indices = array, +// strides = array} : // (tensor<4x6xf32>) -> tensor<4x1xf32> -// %1 = "mhlo.slice"(%input) { -// limit_indices = dense<[4, 3]> : tensor<2xi64>, -// start_indices = dense<[0, 1]> : tensor<2xi64>, -// strides = dense<1> : tensor<2xi64>} : +// %1 = "stablehlo.slice"(%input) { +// limit_indices = array, +// start_indices = array, +// strides = array} : // (tensor<4x6xf32>) -> tensor<4x2xf32> -// %2 = "mhlo.slice"(%input) { -// limit_indices = dense<[4, 6]> : tensor<2xi64>, -// start_indices = dense<[0, 3]> : tensor<2xi64>, -// strides = dense<1> : tensor<2xi64>} : +// %2 = "stablehlo.slice"(%input) { +// limit_indices = array, +// start_indices = array, +// strides = array} : // (tensor<4x6xf32>) -> tensor<4x3xf32> class ConvertSplitVOp : public OpRewritePattern { public: @@ -3427,11 +3484,10 @@ class ConvertSplitVOp : public OpRewritePattern { for (int i = 0, end = op.getNumResults(); i < end; ++i) { end_indices[dim_index] = begin_indices[dim_index] + split_sizes[i]; - slices.push_back(rewriter.create( - op.getLoc(), op.getValue(), - GetI64ElementsAttr(begin_indices, &rewriter), - GetI64ElementsAttr(end_indices, &rewriter), - GetI64ElementsAttr(strides, &rewriter))); + slices.push_back(rewriter.create( + op.getLoc(), op.getValue(), GetI64ArrayAttr(begin_indices, &rewriter), + GetI64ArrayAttr(end_indices, &rewriter), + GetI64ArrayAttr(strides, &rewriter))); // Prepare the begin indice for the next slice. begin_indices[dim_index] = end_indices[dim_index]; } @@ -3446,19 +3502,19 @@ class ConvertSplitVOp : public OpRewritePattern { // strides operands are converted to attributes with non-negative indexing. // // If the begin input is not a compile time constant, the begin input needs to -// be sliced and the slice needs to be lowered to mhlo.DynamicSlice. In this -// case, strides must have a known value of 1 (otherwise we have insufficient -// information to conform to XLA's op semantics). +// be sliced and the slice needs to be lowered to stablehlo.DynamicSlice. In +// this case, strides must have a known value of 1 (otherwise we have +// insufficient information to conform to XLA's op semantics). 
// // For example with an op like following, // tf.StridedSlice(%input, %begin, %end, %strides) {shrink_axis_mask = 1} // : tensor -> tensor // // If the %begin input is constant, output would be: -// %reversed = "mhlo.Reverse" (%input) {dimensions = ...} -// %sliced = "mhlo.Slice" (%input) +// %reversed = "stablehlo.Reverse" (%input) {dimensions = ...} +// %sliced = "stablehlo.Slice" (%input) // {start_indices = ..., limit_indices = ..., strides = ...} -// %output = "mhlo.Reshape" (%sliced) : tensor<1xPxf32> -> tensor +// %output = "stablehlo.Reshape" (%sliced) : tensor<1xPxf32> -> tensor // class ConvertStridedSliceOp : public OpRewritePattern { public: @@ -3512,17 +3568,17 @@ class ConvertStridedSliceOp : public OpRewritePattern { Location loc = op.getLoc(); Value input = op.getInput(); if (!dims_to_reverse.empty()) - input = rewriter.create( + input = rewriter.create( loc, input_ty, op.getInput(), - GetI64ElementsAttr(dims_to_reverse, &rewriter)); - auto sliced = rewriter.create( - loc, input, GetI64ElementsAttr(hlo_begin_indices, &rewriter), - GetI64ElementsAttr(hlo_end_indices, &rewriter), - GetI64ElementsAttr(hlo_strides, &rewriter)); + GetI64ArrayAttr(dims_to_reverse, &rewriter)); + auto sliced = rewriter.create( + loc, input, GetI64ArrayAttr(hlo_begin_indices, &rewriter), + GetI64ArrayAttr(hlo_end_indices, &rewriter), + GetI64ArrayAttr(hlo_strides, &rewriter)); // Reshape slice result so that the shape is updated depending on // 'new_axis_mask' or 'shrink_axis_mask' attributes. - rewriter.replaceOpWithNewOp(op, op.getType(), sliced); + rewriter.replaceOpWithNewOp(op, op.getType(), sliced); return success(); } @@ -3607,12 +3663,12 @@ class ConvertStridedSliceOp : public OpRewritePattern { continue; } - auto index = rewriter.create( - loc, op.getBegin(), GetI64ElementsAttr({d}, &rewriter), - GetI64ElementsAttr({d + 1}, &rewriter), - GetI64ElementsAttr({1}, &rewriter)); + auto index = rewriter.create( + loc, op.getBegin(), GetI64ArrayAttr({d}, &rewriter), + GetI64ArrayAttr({d + 1}, &rewriter), GetI64ArrayAttr({1}, &rewriter)); // Convert index to scalar. - auto reshaped_index = rewriter.create(loc, type, index); + auto reshaped_index = + rewriter.create(loc, type, index); // If the index is negative, wrap it around with dimension size. auto index_negative = rewriter.create(loc, reshaped_index, zero); @@ -3620,23 +3676,23 @@ class ConvertStridedSliceOp : public OpRewritePattern { input_shape[d], &rewriter); auto wrapped_index = rewriter.create(loc, input_val, reshaped_index); - auto final_index = rewriter.create( + auto final_index = rewriter.create( loc, type, index_negative, wrapped_index, reshaped_index); slice_begin_indices.push_back(final_index); slice_sizes.push_back(1); } - auto slice_sizes_attr = GetI64ElementsAttr(slice_sizes, &rewriter); + auto slice_sizes_attr = GetI64ArrayAttr(slice_sizes, &rewriter); auto sliced_type = tensorflow::GetTypeFromTFTensorShape( slice_sizes, op.getType().getElementType()); // This must be an xla DynamicSlice op due to the inputs that aren't // constant. - auto sliced = rewriter.create( + auto sliced = rewriter.create( loc, sliced_type, op.getInput(), slice_begin_indices, slice_sizes_attr); // Reshape slice result so that the shape is updated depending on // 'new_axis_mask' or 'shrink_axis_mask' attributes. 
- rewriter.replaceOpWithNewOp(op, op.getType(), sliced); + rewriter.replaceOpWithNewOp(op, op.getType(), sliced); return success(); } @@ -3704,7 +3760,7 @@ class ConvertStridedSliceGradOp Type element_type = mlir::cast(grad.getType()).getElementType(); // Perform reshape to undo any new/shrink axes done by strided slice. - grad = rewriter.create( + grad = rewriter.create( op.getLoc(), tensorflow::GetTypeFromTFTensorShape(shape, element_type), grad); @@ -3741,22 +3797,21 @@ class ConvertStridedSliceGradOp } if (!dims_to_reverse.empty()) { - grad = rewriter.create( + grad = rewriter.create( op.getLoc(), grad.getType(), grad, - GetI64ElementsAttr(dims_to_reverse, &rewriter)); + GetI64ArrayAttr(dims_to_reverse, &rewriter)); } auto zero = GetScalarConstOfType(element_type, op.getLoc(), 0, &rewriter); - rewriter.replaceOpWithNewOp( - op, op.getType(), grad, zero, - GetI64ElementsAttr(padding_low, &rewriter), - GetI64ElementsAttr(padding_high, &rewriter), - GetI64ElementsAttr(padding_interm, &rewriter)); + rewriter.replaceOpWithNewOp( + op, op.getType(), grad, zero, GetI64ArrayAttr(padding_low, &rewriter), + GetI64ArrayAttr(padding_high, &rewriter), + GetI64ArrayAttr(padding_interm, &rewriter)); return success(); } }; -/// Converts the RangeOp tensorflow op to a mhlo.iota op with a scaling and +/// Converts the RangeOp tensorflow op to a stablehlo.iota op with a scaling and /// offset applied to generate the range values. The output tensor needs to /// have a static shape. /// @@ -3765,11 +3820,11 @@ class ConvertStridedSliceGradOp /// : (tensor, tensor, tensor) -> tensor<5xf32> /// /// Output would be: -/// %iota = "mhlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<5xf32> -/// %scaled = "mhlo.multiply"(%iota, %delta) +/// %iota = "stablehlo.iota"() {iota_dimension = 0 : i64} : () -> +/// tensor<5xf32> %scaled = "stablehlo.multiply"(%iota, %delta) /// {broadcast_dimensions = dense<[]> : tensor<0xi64>} : /// (tensor<5xf32>, tensor) -> tensor<5xf32> -/// %result = "mhlo.add"(%scaled, %offset) +/// %result = "stablehlo.add"(%scaled, %offset) /// {broadcast_dimensions = dense<[]> : tensor<0xi64>} : /// (tensor<5xf32>, tensor) -> tensor<5xf32> /// @@ -3785,8 +3840,8 @@ class ConvertRangeOp : public OpRewritePattern { return failure(); } - auto iota = rewriter.create(op.getLoc(), result_type, - rewriter.getI64IntegerAttr(0)); + auto iota = rewriter.create( + op.getLoc(), result_type, rewriter.getI64IntegerAttr(0)); auto scaled = rewriter.create( op.getLoc(), result_type, iota, op.getDelta(), hlo::getBroadcastDimensionsAttr(&rewriter, iota, op.getDelta())); @@ -3837,24 +3892,25 @@ class ConvertDynamicRangeOp : public OpRewritePattern { // some conversion to float for the operations. // // %size = ceil(abs((%limit - %start) / %delta)) - auto range = rewriter.create(op.getLoc(), limit, start); - auto abs = rewriter.create(op.getLoc(), range); + auto range = + rewriter.create(op.getLoc(), limit, start); + auto abs = rewriter.create(op.getLoc(), range); // Delta is not necessarily the same type as start and limit. auto abs_cast = - rewriter.create(op.getLoc(), compute_type, abs); + rewriter.create(op.getLoc(), compute_type, abs); auto delta_cast = - rewriter.create(op.getLoc(), compute_type, delta); + rewriter.create(op.getLoc(), compute_type, delta); // Compute the total number of integer steps and convert to the HLO // dimension tensor. 
auto normalized = - rewriter.create(op.getLoc(), abs_cast, delta_cast); - auto ceil = rewriter.create(op.getLoc(), normalized); - auto steps = rewriter.create( + rewriter.create(op.getLoc(), abs_cast, delta_cast); + auto ceil = rewriter.create(op.getLoc(), normalized); + auto steps = rewriter.create( op.getLoc(), tensorflow::GetTypeFromTFTensorShape({}, rewriter.getI64Type()), ceil); - auto reshape = rewriter.create( + auto reshape = rewriter.create( op.getLoc(), tensorflow::GetTypeFromTFTensorShape({1}, rewriter.getI64Type()), steps); @@ -3864,12 +3920,12 @@ class ConvertDynamicRangeOp : public OpRewritePattern { // %range = %start + %delta * iota(%size) auto out_scalar_type = tensorflow::GetTypeFromTFTensorShape( {}, getElementTypeOrSelf(result_type)); - auto start_out_cast = - rewriter.create(op.getLoc(), out_scalar_type, start); - auto delta_out_cast = - rewriter.create(op.getLoc(), out_scalar_type, delta); + auto start_out_cast = rewriter.create( + op.getLoc(), out_scalar_type, start); + auto delta_out_cast = rewriter.create( + op.getLoc(), out_scalar_type, delta); - auto iota = rewriter.create( + auto iota = rewriter.create( op.getLoc(), result_type, reshape, rewriter.getI64IntegerAttr(0)); auto scaled = rewriter.create( op.getLoc(), result_type, iota, delta_out_cast, @@ -3881,7 +3937,8 @@ class ConvertDynamicRangeOp : public OpRewritePattern { } }; -ElementsAttr ConvertAxisAttr(Value val, ElementsAttr attr, Builder *builder) { +DenseI64ArrayAttr ConvertAxisAttr(Value val, ElementsAttr attr, + Builder *builder) { auto int_attr = mlir::cast(attr); auto type = mlir::cast(val.getType()); @@ -3893,10 +3950,10 @@ ElementsAttr ConvertAxisAttr(Value val, ElementsAttr attr, Builder *builder) { axis.push_back((val.getSExtValue() + rank) % rank); } - return builder->getI64TensorAttr(axis); + return builder->getDenseI64ArrayAttr(axis); } -/// Converts the LinSpace tensorflow op to a mhlo.iota op with a scaling +/// Converts the LinSpace tensorflow op to a stablehlo.iota op with a scaling /// and offset applied to generate the linspace values. The output tensor needs /// to have a static shape. The implementation is defined in C++ because there /// is no type inference for the iota op. @@ -3926,7 +3983,7 @@ class ConvertLinSpaceOp : public OpRewritePattern { op.getLoc(), op.getStart().getType(), op.getStop(), op.getStart(), hlo::getBroadcastDimensionsAttr(&rewriter, op.getStop(), op.getStart())); - Value step_denominator = rewriter.create( + Value step_denominator = rewriter.create( op.getLoc(), op.getNum(), result_type.getElementType()); if (num > 1) { Value one = GetScalarConstOfType(result_type.getElementType(), @@ -3941,8 +3998,8 @@ class ConvertLinSpaceOp : public OpRewritePattern { step_denominator)); // Scale the iota and add the offset. - auto iota = rewriter.create(op.getLoc(), result_type, - rewriter.getI64IntegerAttr(0)); + auto iota = rewriter.create( + op.getLoc(), result_type, rewriter.getI64IntegerAttr(0)); auto scaled = rewriter.create( op.getLoc(), result_type, iota, step, hlo::getBroadcastDimensionsAttr(&rewriter, iota, step)); @@ -3953,7 +4010,7 @@ class ConvertLinSpaceOp : public OpRewritePattern { } }; -/// Converts a generic OpTy tensorflow op to a mhlo.reduce op over +/// Converts a generic OpTy tensorflow op to a stablehlo.reduce op over /// ReductionOp. /// `is_accumulation` controls whether it uses higher precision for the actual /// reduction. 
This is set to false for ops like max where there is no precision @@ -4011,15 +4068,15 @@ class GenericConvertReductionOp : public OpRewritePattern { // repeated arithmetic operations. Type reduce_element_type = is_accumulation ? GetAccumulationType(element_type) : element_type; - auto casted_input = - rewriter.create(loc, op.getInput(), reduce_element_type); + auto casted_input = rewriter.create( + loc, op.getInput(), reduce_element_type); // Each reduction op can have a different initial value. Value init = Derived::GetInitialValue(reduce_element_type, loc, &rewriter); - auto reduction = rewriter.create( + auto reduction = rewriter.create( loc, casted_input.getResult(), init, - GetI64ElementsAttr(xla_dimensions, &rewriter), reduce_element_type); + GetI64ArrayAttr(xla_dimensions, &rewriter), reduce_element_type); BuildReduceBody(reduce_element_type, &reduction.getBody(), &rewriter); Value result = reduction.getResult(0); @@ -4043,7 +4100,7 @@ class GenericConvertReductionOp : public OpRewritePattern { Value divisor_tensor = rewriter.create( loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getI64Type()), divisor_casted); - Value divisor = rewriter.create( + Value divisor = rewriter.create( loc, tensorflow::GetTypeFromTFTensorShape({}, reduce_element_type), divisor_tensor); auto broadcast_dims = rewriter.getDenseI64ArrayAttr({}); @@ -4051,7 +4108,7 @@ class GenericConvertReductionOp : public OpRewritePattern { broadcast_dims); } - result = rewriter.create(loc, result, element_type); + result = rewriter.create(loc, result, element_type); // Need to reshape back after the reduction if we're keeping the reduced // dimensions. Note that we do this through successive (nominally 1) @@ -4079,12 +4136,13 @@ class GenericConvertReductionOp : public OpRewritePattern { // Converts Mean op to HLO Reduce op. // // %init = arith.constant dense<...> : tensor -// %sum = "mhlo.reduce"(%inp, %init) ["mhlo.add"] +// %sum = "stablehlo.reduce"(%inp, %init) ["stablehlo.add"] // {dimensions = ...} // %divisor = arith.constant dense<...> : tensor -// %mean = "mhlo.divide"(%sum, %divisor) +// %mean = "stablehlo.divide"(%sum, %divisor) class ConvertMeanOp - : public GenericConvertReductionOp { + : public GenericConvertReductionOp { public: using GenericConvertReductionOp::GenericConvertReductionOp; static Value GetInitialValue(Type reduce_element_type, Location loc, @@ -4096,10 +4154,10 @@ class ConvertMeanOp // Converts Sum op to HLO Reduce op. // // %init = arith.constant dense<...> : tensor -// %sum = "mhlo.reduce"(%inp, %init) ["mhlo.add"] +// %sum = "stablehlo.reduce"(%inp, %init) ["stablehlo.add"] // {dimensions = ...} -class ConvertSumOp - : public GenericConvertReductionOp { +class ConvertSumOp : public GenericConvertReductionOp { public: using GenericConvertReductionOp::GenericConvertReductionOp; @@ -4113,10 +4171,11 @@ class ConvertSumOp // Converts Max op to HLO Reduce op. // // %init = arith.constant dense<...> : tensor -// %max = "mhlo.reduce"(%inp, %init) ["mhlo.maximum"] +// %max = "stablehlo.reduce"(%inp, %init) ["stablehlo.maximum"] // {dimensions = ...} class ConvertMaxOp - : public GenericConvertReductionOp { public: using GenericConvertReductionOp::GenericConvertReductionOp; @@ -4131,10 +4190,11 @@ class ConvertMaxOp // Converts Min op to HLO Reduce op. 
// // %init = arith.constant dense<...> : tensor -// %min = "mhlo.reduce"(%inp, %init) ["mhlo.minimum"] +// %min = "stablehlo.reduce"(%inp, %init) ["stablehlo.minimum"] // {dimensions = ...} class ConvertMinOp - : public GenericConvertReductionOp { public: using GenericConvertReductionOp::GenericConvertReductionOp; @@ -4149,10 +4209,11 @@ class ConvertMinOp // Converts Prod op to HLO Reduce op. // // %init = arith.constant dense<...> : tensor -// %prod = "mhlo.reduce"(%inp, %init) ["mhlo.multiply"] +// %prod = "stablehlo.reduce"(%inp, %init) ["stablehlo.multiply"] // {dimensions = ...} class ConvertProdOp - : public GenericConvertReductionOp { + : public GenericConvertReductionOp { public: using GenericConvertReductionOp::GenericConvertReductionOp; @@ -4165,10 +4226,10 @@ class ConvertProdOp // Converts All op to HLO Reduce op. // // %init = arith.constant dense<...> : tensor -// %max = "mhlo.reduce"(%inp, %init) ["mhlo.and"] +// %max = "stablehlo.reduce"(%inp, %init) ["stablehlo.and"] // {dimensions = ...} -class ConvertAllOp - : public GenericConvertReductionOp { +class ConvertAllOp : public GenericConvertReductionOp { public: using GenericConvertReductionOp::GenericConvertReductionOp; static Value GetInitialValue(Type reduce_element_type, Location loc, @@ -4180,10 +4241,10 @@ class ConvertAllOp // Converts Any op to HLO Reduce op. // // %init = arith.constant dense<...> : tensor -// %max = "mhlo.reduce"(%inp, %init) ["mhlo.or"] +// %max = "stablehlo.reduce"(%inp, %init) ["stablehlo.or"] // {dimensions = ...} -class ConvertAnyOp - : public GenericConvertReductionOp { +class ConvertAnyOp : public GenericConvertReductionOp { public: using GenericConvertReductionOp::GenericConvertReductionOp; static Value GetInitialValue(Type reduce_element_type, Location loc, @@ -4240,17 +4301,15 @@ class ConvertArgMinMaxOp : public OpRewritePattern { IntegerAttr iota_dimension = IntegerAttr::get(rewriter.getIntegerType(64), axis); Value input_shape = rewriter.create(loc, op.getInput()); - Value index_values = rewriter.create( + Value index_values = rewriter.create( loc, index_type, input_shape, iota_dimension); Value operands[] = {op.getInput(), index_values}; Value init_values[] = {init_value, index_init_value}; - DenseIntElementsAttr reduction_dimensions = - GetI64ElementsAttr({axis}, &rewriter); - auto reduction = rewriter.create( + auto reduction = rewriter.create( loc, llvm::ArrayRef(operands), - llvm::ArrayRef(init_values), reduction_dimensions, + llvm::ArrayRef(init_values), GetI64ArrayAttr({axis}, &rewriter), TypeRange({input_element_type, index_element_type})); auto direction = Derived::GetDirection(); BuildArgMinMaxReductionBody(input_element_type, index_element_type, @@ -4266,8 +4325,8 @@ class ConvertArgMinMaxOp : public OpRewritePattern { // // %init_index = arith.constant dense<...> : tensor // %init = arith.constant dense<...> : tensor -// %reduce = "mhlo.reduce"(%selected_input, %select_index, %init, -// %init_index) ["mhlo.arg_max"] +// %reduce = "stablehlo.reduce"(%selected_input, %select_index, %init, +// %init_index) ["stablehlo.arg_max"] class ConvertArgMaxOp : public ConvertArgMinMaxOp { public: @@ -4279,7 +4338,9 @@ class ConvertArgMaxOp hlo::kInfinityLowest, &rewriter); } - static ComparisonDirection GetDirection() { return ComparisonDirection::GE; } + static stablehlo::ComparisonDirection GetDirection() { + return stablehlo::ComparisonDirection::GE; + } }; // Converts tensorflow ArgMin op to mhlo operations. 
The actual @@ -4287,8 +4348,8 @@ class ConvertArgMaxOp // // %init_index = arith.constant dense<...> : tensor // %init = arith.constant dense<...> : tensor -// %reduce = "mhlo.reduce"(%selected_input, %select_index, %init, -// %init_index) ["mhlo.arg_min"] +// %reduce = "stablehlo.reduce"(%selected_input, %select_index, %init, +// %init_index) ["stablehlo.arg_min"] class ConvertArgMinOp : public ConvertArgMinMaxOp { public: @@ -4300,13 +4361,15 @@ class ConvertArgMinOp hlo::kInfinityMax, &rewriter); } - static ComparisonDirection GetDirection() { return ComparisonDirection::LE; } + static stablehlo::ComparisonDirection GetDirection() { + return stablehlo::ComparisonDirection::LE; + } }; // Converts TF TensorScatterUpdate/Min/Max/Add/Sub op into Scatter Op with // assignment: // -// %result = "mhlo.scatter"(%tensor, %indices, %updates) +// %result = "stablehlo.scatter"(%tensor, %indices, %updates) // { dimensions = ... } // template @@ -4381,7 +4444,7 @@ class ConvertTensorScatterOp : public OpRewritePattern { mlir::dyn_cast(updates.getType()).getRank(); int64_t window_dims = tensor_rank - num_index_dims; - auto dims_attr = ScatterDimensionNumbersAttr::get( + auto dims_attr = stablehlo::ScatterDimensionNumbersAttr::get( rewriter.getContext(), llvm::to_vector<4>( llvm::seq(updates_rank - window_dims, updates_rank)), @@ -4392,7 +4455,7 @@ class ConvertTensorScatterOp : public OpRewritePattern { indices_rank - 1); Location loc = op.getLoc(); - auto scatter = rewriter.create( + auto scatter = rewriter.create( loc, op.getType(), ValueRange(Value(op.getTensor())), op.getIndices(), updates, dims_attr); Derived::BuildScatterBody(tensor_ty.getElementType(), @@ -4416,7 +4479,7 @@ class ConvertTensorScatterUpdateOp Type type = tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type); block->addArguments({type, type}, SmallVector(2, loc)); - builder.create(loc, block->getArgument(1)); + builder.create(loc, block->getArgument(1)); } }; @@ -4433,9 +4496,9 @@ class ConvertTensorScatterAddOp Type type = tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type); block->addArguments({type, type}, SmallVector(2, loc)); - auto add_op = builder.create(loc, block->getArgument(0), - block->getArgument(1)); - builder.create(loc, add_op.getResult()); + auto add_op = builder.create(loc, block->getArgument(0), + block->getArgument(1)); + builder.create(loc, add_op.getResult()); } }; @@ -4452,9 +4515,9 @@ class ConvertTensorScatterSubOp Type type = tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type); block->addArguments({type, type}, SmallVector(2, loc)); - auto sub_op = builder.create(loc, block->getArgument(0), - block->getArgument(1)); - builder.create(loc, sub_op.getResult()); + auto sub_op = builder.create( + loc, block->getArgument(0), block->getArgument(1)); + builder.create(loc, sub_op.getResult()); } }; @@ -4471,9 +4534,9 @@ class ConvertTensorScatterMinOp Type type = tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type); block->addArguments({type, type}, SmallVector(2, loc)); - auto min_op = builder.create(loc, block->getArgument(0), - block->getArgument(1)); - builder.create(loc, min_op.getResult()); + auto min_op = builder.create(loc, block->getArgument(0), + block->getArgument(1)); + builder.create(loc, min_op.getResult()); } }; @@ -4490,9 +4553,9 @@ class ConvertTensorScatterMaxOp Type type = tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type); block->addArguments({type, type}, SmallVector(2, loc)); - auto max_op = builder.create(loc, 
block->getArgument(0), - block->getArgument(1)); - builder.create(loc, max_op.getResult()); + auto max_op = builder.create(loc, block->getArgument(0), + block->getArgument(1)); + builder.create(loc, max_op.getResult()); } }; @@ -4500,10 +4563,10 @@ class ConvertTensorScatterMaxOp // For shape [S1, S2] and multiples [M1, M2], // MS1 = M1 * S1; MS2 = M2 * S2 // -// %broadcast = mhlo.broadcast_in_dim(%input) { +// %broadcast = stablehlo.broadcast_in_dim(%input) { // broadcast_dimensions = [0, 2] // } -// %result = "mhlo.reshape"(%broadcast) : (tensor) +// %result = "stablehlo.reshape"(%broadcast) : (tensor) // -> tensor class ConvertTileOp : public OpRewritePattern { public: @@ -4556,12 +4619,12 @@ class ConvertTileOp : public OpRewritePattern { tensorflow::GetTypeFromTFTensorShape(broadcasted_shape, element_type); Type output_type = op.getType(); - Value result = rewriter.create( + Value result = rewriter.create( loc, broadcasted_type, op.getInput(), - GetI64ElementsAttr(broadcast_dimensions, &rewriter)); + GetI64ArrayAttr(broadcast_dimensions, &rewriter)); if (output_type != broadcasted_type) { - result = rewriter.create(loc, output_type, result); + result = rewriter.create(loc, output_type, result); } rewriter.replaceOp(op, {result}); @@ -4570,7 +4633,7 @@ class ConvertTileOp : public OpRewritePattern { } }; -// Converts the tf.TileOp op into mhlo.dynamic_reshape +// Converts the tf.TileOp op into stablehlo.dynamic_reshape // TODO(disc): To recover static special case's performance with folding and // canonicalization. class ConvertTileOpDynamic : public OpRewritePattern { @@ -4583,9 +4646,11 @@ class ConvertTileOpDynamic : public OpRewritePattern { // // %out_dim_size = [S1, M1, S2, M2] // %broadcast_dimensions = [1, 3]; - // %broadcast = mhlo.d_broadcast_in_dim(%input, %out_dim_size, %braodcast_dimensions); + // %broadcast = stablehlo.d_broadcast_in_dim( + // %input, %out_dim_size, %braodcast_dimensions); // %shape = [MS1, MS2] - // %result = "mhlo.d_reshape"(%broadcast, %shape) : (tensor) -> tensor + // %result = "stablehlo.d_reshape"(%broadcast, %shape) + // : (tensor) -> tensor // clang-format on LogicalResult matchAndRewrite(TF::TileOp op, PatternRewriter &rewriter) const final { @@ -4640,8 +4705,7 @@ class ConvertTileOpDynamic : public OpRewritePattern { for (int64_t dim_idx = 0; dim_idx < input_rank; ++dim_idx) { broadcast_dimensions.push_back(1 + 2 * dim_idx); } - auto broadcast_dims_attr = - GetI64ElementsAttr(broadcast_dimensions, &rewriter); + auto broadcast_dims_attr = GetI64ArrayAttr(broadcast_dimensions, &rewriter); Value out_dim_size_tensor = rewriter.create( loc, @@ -4652,7 +4716,7 @@ class ConvertTileOpDynamic : public OpRewritePattern { ShapedType::kDynamic); RankedTensorType broadcast_type = tensorflow::GetTypeFromTFTensorShape(broadcast_shape, element_type); - Value broadcast = rewriter.create( + Value broadcast = rewriter.create( loc, broadcast_type, input, out_dim_size_tensor, broadcast_dims_attr); // %shape = [MS1, MS2] @@ -4666,8 +4730,8 @@ class ConvertTileOpDynamic : public OpRewritePattern { Value shape = rewriter.create( loc, tensorflow::GetTypeFromTFTensorShape({input_rank}, index_ty), shape_values); - rewriter.replaceOpWithNewOp(op, op.getType(), - broadcast, shape); + rewriter.replaceOpWithNewOp(op, op.getType(), + broadcast, shape); return success(); } }; @@ -4694,13 +4758,15 @@ class ConvertMaxPoolGradOp : public OpRewritePattern { input_ty.getShape(), op.getKsize(), op.getStrides(), op.getPadding(), &rewriter); - auto result = rewriter.create( + auto 
result = rewriter.create( loc, op.getType(), op.getOrigInput(), op.getGrad(), GetScalarConstOfType(element_type, loc, 0, &rewriter), - GetI64ElementsAttr(op.getKsize()), GetI64ElementsAttr(op.getStrides()), + ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getKsize()), &rewriter), + ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getStrides()), &rewriter), paddings_attr); - BuildReduceBody(element_type, &result.getScatter(), &rewriter); + BuildReduceBody(element_type, &result.getScatter(), + &rewriter); { OpBuilder::InsertionGuard guard(rewriter); Block *block = rewriter.createBlock(&result.getSelect()); @@ -4710,10 +4776,10 @@ class ConvertMaxPoolGradOp : public OpRewritePattern { tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type); block->addArguments({type, type}, SmallVector(2, loc)); - auto reducer = rewriter.create(loc, block->getArgument(0), - block->getArgument(1), - ComparisonDirection::GE); - rewriter.create(loc, reducer.getResult()); + auto reducer = rewriter.create( + loc, block->getArgument(0), block->getArgument(1), + stablehlo::ComparisonDirection::GE); + rewriter.create(loc, reducer.getResult()); } rewriter.replaceOp(op, result); @@ -4728,8 +4794,8 @@ using ConvertMaxPool3DGradOp = ConvertMaxPoolGradOp; // Converts tf.Conv?DBackpropInputOp into: -// %rev_filter = "mhlo.reverse"(%filter) -// %result = "mhlo.convolution"(%out_backprop, %rev_filter) +// %rev_filter = "stablehlo.reverse"(%filter) +// %result = "stablehlo.convolution"(%out_backprop, %rev_filter) template class ConvertConvBackpropInputOp : public OpRewritePattern { public: @@ -4858,8 +4924,8 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { int64_t expanded_output_size = (output_size - 1) * stride + 1; int64_t pad_after = padded_out_size - expanded_output_size - pad_before; - // Populate metadata for the upcoming mhlo.conv op using the result of - // the computations performed above. + // Populate metadata for the upcoming stablehlo.conv op using the result + // of the computations performed above. lhs_dilation.push_back(stride); rhs_dilation.push_back(dilation); paddings.push_back(pad_before); @@ -4889,7 +4955,7 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { Type filter_element_ty = filter_ty.getElementType(); auto ty = tensorflow::GetTypeFromTFTensorShape(new_shape, filter_element_ty); - filter = rewriter.create(op.getLoc(), ty, filter); + filter = rewriter.create(op.getLoc(), ty, filter); // 2. Transpose to [H, W, ..., G, filter_in_depth, out_depth / G]. llvm::SmallVector perm(num_dims + 1); @@ -4897,15 +4963,15 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { std::swap(perm[num_spatial_dims], perm[num_spatial_dims + 1]); std::swap(new_shape[num_spatial_dims], new_shape[num_spatial_dims + 1]); ty = tensorflow::GetTypeFromTFTensorShape(new_shape, filter_element_ty); - filter = rewriter.create( - op.getLoc(), ty, filter, GetI64ElementsAttr(perm, &rewriter)); + filter = rewriter.create( + op.getLoc(), ty, filter, GetI64ArrayAttr(perm, &rewriter)); // 3. Reshape to [H, W, ..., in_depth, out_depth / G]. 
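Reading aid, not part of the patch: these hunks repeatedly swap the DenseIntElementsAttr helper (GetI64ElementsAttr) for array-attribute helpers (GetI64ArrayAttr, GetI64ArrayAttrForValue, ToDenseI64ArrayAttr), because the StableHLO op builders take DenseI64ArrayAttr where the MHLO builders took DenseIntElementsAttr. The helper names below match the calls in this change, but the bodies are only an assumed sketch, not the actual implementations added elsewhere in the patch.

// Sketch only; assumes integer-typed elements.
#include <cstdint>
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"

// Wraps a list of i64 values into the array attribute form StableHLO expects.
static mlir::DenseI64ArrayAttr GetI64ArrayAttr(llvm::ArrayRef<int64_t> values,
                                               mlir::Builder *builder) {
  return builder->getDenseI64ArrayAttr(values);
}

// Re-packs the values of a DenseIntElementsAttr (the MHLO-style attribute)
// into a DenseI64ArrayAttr.
static mlir::DenseI64ArrayAttr ToDenseI64ArrayAttr(
    mlir::DenseIntElementsAttr elements, mlir::Builder *builder) {
  return builder->getDenseI64ArrayAttr(
      llvm::to_vector(elements.getValues<int64_t>()));
}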
new_shape[num_spatial_dims] *= new_shape[num_spatial_dims + 1]; new_shape[num_spatial_dims + 1] = new_shape.back(); new_shape.pop_back(); ty = tensorflow::GetTypeFromTFTensorShape(new_shape, filter_element_ty); - filter = rewriter.create(op.getLoc(), ty, filter); + filter = rewriter.create(op.getLoc(), ty, filter); } SmallVector kernel_spatial_dims; @@ -4913,21 +4979,21 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { std::iota(kernel_spatial_dims.begin(), kernel_spatial_dims.end(), 0); // Mirror the filter in the spatial dimensions. - filter = rewriter.create( - op.getLoc(), filter, - GetI64ElementsAttr(kernel_spatial_dims, &rewriter)); + filter = rewriter.create( + op.getLoc(), filter, GetI64ArrayAttr(kernel_spatial_dims, &rewriter)); // activation gradients // = gradients (with padding and dilation) mirrored_weights - Value result = rewriter.create( + Value result = rewriter.create( op.getLoc(), op.getType(), op.getOutBackprop(), filter, /*window_strides=*/ - GetI64ElementsAttrForValue(/*size=*/num_spatial_dims, /*val=*/1, - &rewriter), - /*padding=*/paddings_attr, GetI64ElementsAttr(lhs_dilation, &rewriter), - GetI64ElementsAttr(rhs_dilation, &rewriter), + GetI64ArrayAttrForValue(/*size=*/num_spatial_dims, /*val=*/1, + &rewriter), + /*padding=*/paddings_attr, + /*lhs_dilation=*/GetI64ArrayAttr(lhs_dilation, &rewriter), + /*rhs_dilation=*/GetI64ArrayAttr(rhs_dilation, &rewriter), /*window_reversal=*/nullptr, - ConvDimensionNumbersAttr::get( + stablehlo::ConvDimensionNumbersAttr::get( rewriter.getContext(), /*inputBatchDimension=*/batch_dim, /*inputFeatureDimension=*/feature_dim, @@ -4961,7 +5027,7 @@ using ConvertConv3DBackpropInputOp = /*num_spatial_dims=*/3>; // Converts tf.Conv?DBackpropFilterOp into: -// %result = "mhlo.convolution"(%input, %out_backprop) +// %result = "stablehlo.convolution"(%input, %out_backprop) template class ConvertConvBackpropFilterOp : public OpRewritePattern { public: @@ -5125,15 +5191,15 @@ class ConvertConvBackpropFilterOp : public OpRewritePattern { const int batch_dim = tensorflow::GetTensorBatchDimIndex(num_dims, data_format); - Value result = rewriter.create( + Value result = rewriter.create( op.getLoc(), op.getType(), op.getInput(), op.getOutBackprop(), - /*window_strides=*/GetI64ElementsAttr(window_strides, &rewriter), + /*window_strides=*/GetI64ArrayAttr(window_strides, &rewriter), /*padding=*/paddings_attr, /*lhs_dilation=*/ - GetI64ElementsAttrForValue(/*size=*/num_spatial_dims, /*val=*/1, - &rewriter), - GetI64ElementsAttr(rhs_dilation, &rewriter), + GetI64ArrayAttrForValue(/*size=*/num_spatial_dims, /*val=*/1, + &rewriter), + GetI64ArrayAttr(rhs_dilation, &rewriter), /*window_reversal=*/nullptr, - ConvDimensionNumbersAttr::get( + stablehlo::ConvDimensionNumbersAttr::get( rewriter.getContext(), // Swap batch_dim and feature_dim in the activations. /*inputBatchDimension=*/feature_dim, @@ -5203,22 +5269,22 @@ class ConvertOneHotOp : public OpRewritePattern { // just using static broadcasting. 
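Reading aid, not part of the patch: the ConvertOneHotOp rewrite that follows builds an iota along the one-hot axis, broadcasts the indices against it, compares for equality, and selects between broadcast on/off values. A plain C++ mirror of that computation for 1-D indices, purely to illustrate the semantics (the loop variable d plays the role of the iota):

#include <cstdint>
#include <vector>

// Sketch only: one_hot(indices, depth) via the iota/compare/select recipe.
std::vector<float> OneHot1D(const std::vector<int64_t> &indices, int64_t depth,
                            float on_value, float off_value) {
  std::vector<float> result(indices.size() * depth, off_value);
  for (size_t i = 0; i < indices.size(); ++i) {
    for (int64_t d = 0; d < depth; ++d) {          // iota over the one-hot axis
      if (indices[i] == d) result[i * depth + d] = on_value;  // compare + select
    }
  }
  return result;
}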
auto index_type = tensorflow::GetTypeFromTFTensorShape(output_dims, element_type); - auto iota = rewriter.create( + auto iota = rewriter.create( loc, index_type, IntegerAttr::get(rewriter.getIntegerType(64), axis)); - auto broadcast_indices = rewriter.create( + auto broadcast_indices = rewriter.create( loc, index_type, op.getIndices(), - GetI64ElementsAttr(broadcast_dims, &rewriter)); + GetI64ArrayAttr(broadcast_dims, &rewriter)); - Value compare = rewriter.create( - loc, broadcast_indices, iota, ComparisonDirection::EQ); - Value on_value = rewriter.create( + Value compare = rewriter.create( + loc, broadcast_indices, iota, stablehlo::ComparisonDirection::EQ); + Value on_value = rewriter.create( loc, op.getType(), op.getOnValue(), - GetI64ElementsAttr(output_dims, &rewriter)); - Value off_value = rewriter.create( + GetI64ArrayAttr(output_dims, &rewriter)); + Value off_value = rewriter.create( loc, op.getType(), op.getOffValue(), - GetI64ElementsAttr(output_dims, &rewriter)); - Value result = rewriter.create(loc, op.getType(), compare, - on_value, off_value); + GetI64ArrayAttr(output_dims, &rewriter)); + Value result = rewriter.create( + loc, op.getType(), compare, on_value, off_value); rewriter.replaceOp(op, {result}); @@ -5234,17 +5300,17 @@ class ConvertOneHotOp : public OpRewritePattern { // operations within a computation. The token type can come from other // infeed/outfeed/send/recv ops or can be generated using create_token op with // no operands. Here we emit a create_token op to generate the token type -// operand of infeed. The mhlo.InfeedOp can produce multiple results and later -// will be exported to XLA infeed op with single tuple return type. +// operand of infeed. The stablehlo.InfeedOp can produce multiple results and +// later will be exported to XLA infeed op with single tuple return type. // // For example the following IR: // %0:2 = "tf.InfeedDequeueTuple"() : () -> (tensor<3xi32>, tensor<4xf32>) // // would be lowered to // -// %token = "mhlo.create_token"() : () -> !mhlo.token -// %data_and_token = "mhlo.infeed"(%token) {infeed_config = ""} : -// (!mhlo.token) -> tensor<3xi32>, tensor<4xf32>, !mhlo.token> +// %token = "stablehlo.create_token"() : () -> !stablehlo.token +// %data_and_token = "stablehlo.infeed"(%token) {infeed_config = ""} : +// (!stablehlo.token) -> tensor<3xi32>, tensor<4xf32>, !stablehlo.token> // class ConvertInfeedDequeueTupleOp : public OpRewritePattern { @@ -5265,16 +5331,16 @@ class ConvertInfeedDequeueTupleOp // Infeed takes a single token operand. Generate the token using // create_token op to pass to the infeed op. - auto token = rewriter.create( - op.getLoc(), mhlo::TokenType::get(rewriter.getContext())); + auto token = rewriter.create( + op.getLoc(), stablehlo::TokenType::get(rewriter.getContext())); result_types.push_back(token.getType()); ArrayAttr layout; // filled in during the xla-adjust-layout pass - auto data_and_token = - rewriter.create(op.getLoc(), result_types, token, - /*infeed_config=*/rewriter.getStringAttr(""), - /*layout=*/layout); + auto data_and_token = rewriter.create( + op.getLoc(), result_types, token, + /*infeed_config=*/rewriter.getStringAttr(""), + /*layout=*/layout); result_types.pop_back(); // remove the token type. @@ -5301,9 +5367,9 @@ class ConvertInfeedDequeueTupleOp } if (op->hasAttr("layouts")) { - // Append a UnitAttr for the "token" operand of the mhlo.infeed op here to - // avoid compilation failure when exporting "layouts" attribute of the - // corresponding InfeedDequeueTupleOp to a graph node. 
+ // Append a UnitAttr for the "token" operand of the stablehlo.infeed op + // here to avoid compilation failure when exporting "layouts" attribute of + // the corresponding InfeedDequeueTupleOp to a graph node. data_and_token->setAttr("layout", op->getAttr("layouts")); } llvm::SmallVector results; @@ -5328,10 +5394,11 @@ class ConvertInfeedDequeueTupleOp // // would be lowered to // -// %token = "mhlo.create_token"() : () -> !mhlo.token -// %outfeed_token = "mhlo.outfeed"(%val_1, %val_2, %token) {outfeed_config = ""} +// %token = "stablehlo.create_token"() : () -> !stablehlo.token +// %outfeed_token = "stablehlo.outfeed"(%val_1, %val_2, %token) {outfeed_config +// = ""} // : -// (tensor<3xi32>, tensor<4xf32>, !mhlo.token) -> !mhlo.token +// (tensor<3xi32>, tensor<4xf32>, !stablehlo.token) -> !stablehlo.token // class ConvertOutfeedEnqueueTupleOp : public OpRewritePattern { @@ -5340,11 +5407,13 @@ class ConvertOutfeedEnqueueTupleOp LogicalResult matchAndRewrite(TF::OutfeedEnqueueTupleOp op, PatternRewriter &rewriter) const override { - auto token_type = mhlo::TokenType::get(rewriter.getContext()); - auto token = rewriter.create(op.getLoc(), token_type); + auto token_type = stablehlo::TokenType::get(rewriter.getContext()); + auto token = + rewriter.create(op.getLoc(), token_type); - rewriter.create(op.getLoc(), token_type, op.getInputs(), token, - /*outfeed_config=*/rewriter.getStringAttr("")); + rewriter.create( + op.getLoc(), token_type, op.getInputs(), token, + /*outfeed_config=*/rewriter.getStringAttr("")); rewriter.eraseOp(op); return success(); } @@ -5406,11 +5475,10 @@ class ConvertUnpackOp : public OpRewritePattern { begin_indices[axis] = i; end_indices[axis] = i + 1; - auto slice_op = rewriter.create( - op.getLoc(), op.getValue(), - GetI64ElementsAttr(begin_indices, &rewriter), - GetI64ElementsAttr(end_indices, &rewriter), - GetI64ElementsAttr(strides, &rewriter)); + auto slice_op = rewriter.create( + op.getLoc(), op.getValue(), GetI64ArrayAttr(begin_indices, &rewriter), + GetI64ArrayAttr(end_indices, &rewriter), + GetI64ArrayAttr(strides, &rewriter)); // Reshape to drop the axis dimension. auto result = rewriter.create( op.getLoc(), op.getType(i), slice_op, @@ -5487,7 +5555,7 @@ class ConvertUnpackOpDynamic : public OpRewritePattern { for (int64_t i = 0; i < op.getNumResults(); ++i) { begin_indices[axis] = rewriter.create(loc, i, 32); end_indices[axis] = rewriter.create(loc, i + 1, 32); - Value slice_op = rewriter.create( + Value slice_op = rewriter.create( loc, tensorflow::GetTypeFromTFTensorShape(slice_shape, value_type.getElementType()), @@ -5513,8 +5581,8 @@ class ConvertUnpackOpDynamic : public OpRewritePattern { tensorflow::GetTypeFromTFTensorShape( {static_cast(shape_values.size())}, i32_ty), shape_values); - Value reshape_op = rewriter.create(loc, op.getType(i), - slice_op, new_shape); + Value reshape_op = rewriter.create( + loc, op.getType(i), slice_op, new_shape); results.push_back(reshape_op); } @@ -5551,7 +5619,7 @@ class ConvertSigmoidGradOpDynamic : public OpRewritePattern { assert(mlir::isa(elem_tp)); attr = rewriter.getFloatAttr(elem_tp, 1); } - Value one = rewriter.create( + Value one = rewriter.create( loc, DenseElementsAttr::get( tensorflow::GetTypeFromTFTensorShape({}, elem_tp), attr)); @@ -5616,9 +5684,9 @@ class GenericConvertUnsortedSegmentReductionOp : public OpRewritePattern { // 'operand' parameter to scatter to for the final scatter op. 
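Reading aid, not part of the patch: GenericConvertUnsortedSegmentReductionOp below broadcasts the reduction's initial value to the output shape and then emits a stablehlo.scatter whose region applies the reduction op (Add, Mul, Min, or Max). For the Sum case, the computation it expresses is shown here as a plain C++ sketch for 1-D data:

#include <cstdint>
#include <vector>

// Sketch only: unsorted_segment_sum(data, segment_ids, num_segments).
std::vector<float> UnsortedSegmentSum(const std::vector<float> &data,
                                      const std::vector<int64_t> &segment_ids,
                                      int64_t num_segments) {
  std::vector<float> output(num_segments, 0.0f);  // broadcast of the init value
  for (size_t i = 0; i < data.size(); ++i) {
    int64_t seg = segment_ids[i];
    // Out-of-range segment ids are dropped; in-range ones scatter-add.
    if (seg >= 0 && seg < num_segments) output[seg] += data[i];
  }
  return output;
}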
Value init = ConcreteClass::GetInitialValue(data_type.getElementType(), op.getLoc(), &rewriter); - auto broadcasted_init = rewriter.create( + auto broadcasted_init = rewriter.create( op.getLoc(), output_type, init, - GetI64ElementsAttr(output_shape, &rewriter)); + GetI64ArrayAttr(output_shape, &rewriter)); // Parameters for the generated scatter op. SmallVector inserted_window_dims(1, 0); @@ -5626,7 +5694,7 @@ class GenericConvertUnsortedSegmentReductionOp : public OpRewritePattern { int64_t index_vector_dim = segment_ids_rank; // Put all parameters in a StructAttr. - auto dims_attr = ScatterDimensionNumbersAttr::get( + auto dims_attr = stablehlo::ScatterDimensionNumbersAttr::get( rewriter.getContext(), llvm::to_vector<4>(llvm::seq(segment_ids_rank, data_rank)), inserted_window_dims, @@ -5634,7 +5702,7 @@ class GenericConvertUnsortedSegmentReductionOp : public OpRewritePattern { /*scatterIndicesBatchingDims=*/{}, scatter_dims_to_operand_dims, index_vector_dim); - auto scatter = rewriter.create( + auto scatter = rewriter.create( op.getLoc(), op.getType(), ValueRange(Value(broadcasted_init)), op.getSegmentIds(), op.getData(), dims_attr); BuildReduceBody(data_type.getElementType(), @@ -5647,7 +5715,8 @@ class GenericConvertUnsortedSegmentReductionOp : public OpRewritePattern { class ConvertUnsortedSegmentMaxOp : public GenericConvertUnsortedSegmentReductionOp< - ConvertUnsortedSegmentMaxOp, TF::UnsortedSegmentMaxOp, MaxOp> { + ConvertUnsortedSegmentMaxOp, TF::UnsortedSegmentMaxOp, + stablehlo::MaxOp> { public: using GenericConvertUnsortedSegmentReductionOp:: GenericConvertUnsortedSegmentReductionOp; @@ -5661,7 +5730,8 @@ class ConvertUnsortedSegmentMaxOp class ConvertUnsortedSegmentMinOp : public GenericConvertUnsortedSegmentReductionOp< - ConvertUnsortedSegmentMinOp, TF::UnsortedSegmentMinOp, MinOp> { + ConvertUnsortedSegmentMinOp, TF::UnsortedSegmentMinOp, + stablehlo::MinOp> { public: using GenericConvertUnsortedSegmentReductionOp:: GenericConvertUnsortedSegmentReductionOp; @@ -5675,7 +5745,8 @@ class ConvertUnsortedSegmentMinOp class ConvertUnsortedSegmentProdOp : public GenericConvertUnsortedSegmentReductionOp< - ConvertUnsortedSegmentProdOp, TF::UnsortedSegmentProdOp, MulOp> { + ConvertUnsortedSegmentProdOp, TF::UnsortedSegmentProdOp, + stablehlo::MulOp> { public: using GenericConvertUnsortedSegmentReductionOp:: GenericConvertUnsortedSegmentReductionOp; @@ -5688,7 +5759,8 @@ class ConvertUnsortedSegmentProdOp class ConvertUnsortedSegmentSumOp : public GenericConvertUnsortedSegmentReductionOp< - ConvertUnsortedSegmentSumOp, TF::UnsortedSegmentSumOp, AddOp> { + ConvertUnsortedSegmentSumOp, TF::UnsortedSegmentSumOp, + stablehlo::AddOp> { public: using GenericConvertUnsortedSegmentReductionOp:: GenericConvertUnsortedSegmentReductionOp; @@ -5780,11 +5852,11 @@ class ConvertRandomShuffleOp : public OpRewritePattern { auto keys = CreateRngUniform32(op.getLoc(), num_elements, /*lower_limit=*/0, /*upper_limit=*/u32_max, &rewriter); - auto sorted = createSortOp( + auto sorted = stablehlo::createSortOp( &rewriter, op.getLoc(), {keys, current}, {rewriter.getIntegerType(32), input_type.getElementType()}, /*dimension=*/-1, /*isStable=*/false, - /*direction=*/ComparisonDirection::LT); + /*direction=*/stablehlo::ComparisonDirection::LT); current = sorted.getResult(1); } rewriter.replaceOp(op, current); @@ -5796,7 +5868,7 @@ class ConvertRandomShuffleOp : public OpRewritePattern { // Generate range(n) as the initial value for the indices to be swapped. 
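Reading aid, not part of the patch: the swap body built below exchanges indices[i] with indices[swaps[i]] by composing dynamic_slice and dynamic_update_slice ops. Per iteration the effect is the ordinary exchange in this plain C++ sketch:

#include <cstdint>
#include <utility>
#include <vector>

// Sketch only: apply the random swaps to the index vector; assumes every
// entry of `swaps` is a valid index.
void ApplySwaps(std::vector<int32_t> &indices,
                const std::vector<int32_t> &swaps) {
  for (size_t i = 0; i < indices.size(); ++i) {
    // indices[i] <- indices[swaps[i]]  and  indices[swaps[i]] <- indices[i]
    std::swap(indices[i], indices[swaps[i]]);
  }
}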
auto indices_type = tensorflow::GetTypeFromTFTensorShape( {first_dim_size}, rewriter.getIntegerType(32)); - Value indices = rewriter.create( + Value indices = rewriter.create( op.getLoc(), indices_type, rewriter.getI64IntegerAttr(0)); // Generate random numbers to be used as swaps for the indices. @@ -5812,28 +5884,26 @@ class ConvertRandomShuffleOp : public OpRewritePattern { auto scalar_i32_type = tensorflow::GetTypeFromTFTensorShape({}, builder->getIntegerType(32)); - auto one_cross_i64_type = tensorflow::GetTypeFromTFTensorShape( - {1}, builder->getIntegerType(64)); - auto scalar_one = - DenseIntElementsAttr::get(one_cross_i64_type, ArrayRef(1)); + auto scalar_one = builder->getDenseI64ArrayAttr({1}); // We need to swap the indices[i] with indices[swaps[i]]. First get // these index values. - Value source_index = - builder->create(loc, indices, i, scalar_one); - Value swap_index = builder->create( + Value source_index = builder->create( + loc, indices, i, scalar_one); + Value swap_index = builder->create( loc, scalar_i32_type, - builder->create(loc, swaps, i, scalar_one)); - Value target_index = builder->create( + builder->create(loc, swaps, i, + scalar_one)); + Value target_index = builder->create( loc, indices, swap_index, scalar_one); // Then perform the swap. // indices[i] <- indices[swaps[i]] - indices = builder->create( + indices = builder->create( loc, indices.getType(), indices, target_index, llvm::ArrayRef(i)); // indices[swaps[i]] <- indices[i] - indices = builder->create( + indices = builder->create( loc, indices.getType(), indices, source_index, llvm::ArrayRef(swap_index)); @@ -5850,7 +5920,7 @@ class ConvertRandomShuffleOp : public OpRewritePattern { // Gather the data using the swapped indices as the shuffled order. auto slice_sizes = tensorflow::ConvertMlirShapeToTF(input_type.getShape()); slice_sizes[0] = 1; - auto dims_attr = GatherDimensionNumbersAttr::get( + auto dims_attr = stablehlo::GatherDimensionNumbersAttr::get( rewriter.getContext(), /*offsetDims=*/llvm::to_vector<4>(llvm::seq(1, input_rank)), /*collapsedSliceDims=*/{0}, @@ -5874,14 +5944,14 @@ class ConvertRandomShuffleOp : public OpRewritePattern { index_to_i64); slice_sizes_values.push_back(i64_to_tensor); } else { - slice_sizes_values.push_back(rewriter.create( + slice_sizes_values.push_back(rewriter.create( op.getLoc(), GetI64ElementsAttr({slice_sizes[i]}, &rewriter))); } } - auto slice_sizes_concat = rewriter.create( + auto slice_sizes_concat = rewriter.create( op.getLoc(), slice_sizes_values, rewriter.getI64IntegerAttr(0)); - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, op.getType(), op.getValue(), swaped_indices, slice_sizes_concat, dims_attr); @@ -5903,7 +5973,7 @@ class ConvertXlaShardingOp : public OpRewritePattern { NamedAttribute call_target_name = rewriter.getNamedAttr( "call_target_name", rewriter.getStringAttr("Sharding")); - auto custom_call = rewriter.create( + auto custom_call = rewriter.create( op.getLoc(), op.getType(), op.getInput(), ArrayRef{call_target_name}); custom_call->setAttr(kShardingAttr, op.get_XlaShardingAttr()); @@ -5959,8 +6029,8 @@ class ConvertInplaceUpdateOp : public OpRewritePattern { tensorflow::GetTypeFromTFTensorShape(split_updates_shape, updates_type.getElementType())); - auto cst = - rewriter.create(op.getLoc(), zero_attr).getResult(); + auto cst = rewriter.create(op.getLoc(), zero_attr) + .getResult(); auto split_updates = rewriter.create( op.getLoc(), split_updates_type, cst, updates); @@ -5970,7 +6040,7 @@ class ConvertInplaceUpdateOp : public 
OpRewritePattern { for (auto pair : llvm::zip(unpacked_indices.getOutput(), split_updates.getOutput())) { input_indices.front() = std::get<0>(pair); - input = rewriter.create( + input = rewriter.create( op.getLoc(), op.getType(), input, std::get<1>(pair), input_indices); } @@ -5999,7 +6069,7 @@ class ConvertXlaDynamicUpdateSliceOp auto unpacked_indices = rewriter.create( op.getLoc(), unpacked_indices_type, op.getIndices(), IntegerAttr::get(rewriter.getIntegerType(64), 0)); - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, op.getType(), op.getInput(), op.getUpdate(), unpacked_indices.getOutput()); return success(); @@ -6029,30 +6099,30 @@ class ConvertXlaReduceScatterOp Location loc = op.getLoc(); Type element_type = getElementTypeOrSelf(op.getInput().getType()); - auto reduce_scatter = rewriter.create( + auto reduce_scatter = rewriter.create( loc, op.getType(), op.getInput(), rewriter.getIntegerAttr(rewriter.getIntegerType(64), scatter_dimension.getSExtValue()), - replica_groups, ChannelHandleAttr()); + replica_groups, stablehlo::ChannelHandleAttr()); StringRef reduce_op = op.getReduceOp(); if (reduce_op == "Add") { - BuildReduceBody(element_type, &reduce_scatter.getComputation(), - &rewriter); + BuildReduceBody( + element_type, &reduce_scatter.getComputation(), &rewriter); } else if (reduce_op == "Mul") { - BuildReduceBody(element_type, &reduce_scatter.getComputation(), - &rewriter); + BuildReduceBody( + element_type, &reduce_scatter.getComputation(), &rewriter); } else if (reduce_op == "Min") { - BuildReduceBody(element_type, &reduce_scatter.getComputation(), - &rewriter); + BuildReduceBody( + element_type, &reduce_scatter.getComputation(), &rewriter); } else if (reduce_op == "Max") { - BuildReduceBody(element_type, &reduce_scatter.getComputation(), - &rewriter); + BuildReduceBody( + element_type, &reduce_scatter.getComputation(), &rewriter); } else { // For mean, add replicas in the same group. Then divide the sum by the // number of replicas in each group below. assert(reduce_op == "Mean"); - BuildReduceBody(element_type, &reduce_scatter.getComputation(), - &rewriter); + BuildReduceBody( + element_type, &reduce_scatter.getComputation(), &rewriter); } Value result = reduce_scatter.getResult(); @@ -6072,7 +6142,7 @@ class ConvertXlaReduceScatterOp } }; -// Converts tf.XlaReduceWindow to mhlo.ReduceWindow +// Converts tf.XlaReduceWindow to stablehlo.ReduceWindow class ConvertXlaReduceWindowOp : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -6093,17 +6163,13 @@ class ConvertXlaReduceWindowOp Location loc = op.getLoc(); SmallVector result_types{op.getResult().getType()}; - // Create the mhlo.SelectAndScatter op. - auto reduce_window_op = rewriter.create( + // Create the stablehlo.SelectAndScatter op. 
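Reading aid, not part of the patch: ConvertXlaReduceWindowOp maps tf.XlaReduceWindow onto stablehlo.reduce_window, with window dimensions, strides, and dilations now passed as DenseI64ArrayAttr. For readers unfamiliar with the op, this plain C++ sketch shows the 1-D case with an Add reducer, unit dilation, and no padding:

#include <vector>

// Sketch only: 1-D reduce_window with window size w, stride s, Add reducer.
std::vector<float> ReduceWindowSum1D(const std::vector<float> &x, int w, int s) {
  std::vector<float> out;
  for (int start = 0; start + w <= static_cast<int>(x.size()); start += s) {
    float acc = 0.0f;  // init value of the reduction
    for (int k = 0; k < w; ++k) acc += x[start + k];
    out.push_back(acc);
  }
  return out;
}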
+ auto reduce_window_op = rewriter.create( loc, result_types, op.getInput(), op.getInitValue(), - mlir::cast(hlo::convertElementsAttr( - window_dimensions, rewriter.getIntegerType(64))), - mlir::cast(hlo::convertElementsAttr( - window_strides, rewriter.getIntegerType(64))), - mlir::cast(hlo::convertElementsAttr( - base_dilations, rewriter.getIntegerType(64))), - mlir::cast(hlo::convertElementsAttr( - window_dilations, rewriter.getIntegerType(64))), + ToDenseI64ArrayAttr(window_dimensions, &rewriter), + ToDenseI64ArrayAttr(window_strides, &rewriter), + ToDenseI64ArrayAttr(base_dilations, &rewriter), + ToDenseI64ArrayAttr(window_dilations, &rewriter), mlir::cast( hlo::convertElementsAttr(padding, rewriter.getIntegerType(64)))); // Insert a call to the reducer in the region of the mhlo op. @@ -6156,7 +6222,8 @@ class ConvertClipByValueOp : public OpRewritePattern { rewriter.create(op.getLoc(), input_ty, max, shape); } - rewriter.replaceOpWithNewOp(op, input_ty, min, input, max); + rewriter.replaceOpWithNewOp(op, input_ty, min, input, + max); return success(); } }; @@ -6176,7 +6243,7 @@ class ConvertConstOp : public OpRewritePattern { return failure(); Location loc = op.getLoc(); - Value result = rewriter.create(loc, op.getValue()); + Value result = rewriter.create(loc, op.getValue()); if (result.getType() != op.getType()) result = rewriter.create(loc, op.getType(), result); rewriter.replaceOp(op, result); @@ -6196,10 +6263,12 @@ class ConvertCumOp : public OpRewritePattern { LogicalResult matchAndRewrite(OpT op, PatternRewriter &rewriter) const override { auto input = mlir::dyn_cast>(op.getX()); - if (!input) return failure(); + if (!input) { + return rewriter.notifyMatchFailure(op, "input X not ranked tensor"); + } auto input_type = mlir::dyn_cast(input.getType()); if (!input_type || !input_type.hasStaticShape()) { - return failure(); + return rewriter.notifyMatchFailure(op, "input not static shape"); } ArrayRef input_shape = input_type.getShape(); @@ -6208,7 +6277,7 @@ class ConvertCumOp : public OpRewritePattern { // We can only match when the axis is a constant scalar. DenseIntElementsAttr axis_attr; if (!matchPattern(op.getAxis(), m_Constant(&axis_attr))) { - return failure(); + return rewriter.notifyMatchFailure(op, "axis not constant"); } // Get the dimension to apply the reduction on, and offset properly if it is @@ -6222,8 +6291,8 @@ class ConvertCumOp : public OpRewritePattern { // the input and then later reverse the output. if (op.getReverse()) { llvm::SmallVector dims_to_reverse({axis}); - input = rewriter.create( - op.getLoc(), input, GetI64ElementsAttr(dims_to_reverse, &rewriter)); + input = rewriter.create( + op.getLoc(), input, GetI64ArrayAttr(dims_to_reverse, &rewriter)); } // Convert if we need to enlarge the element type's bitwidth to avoid @@ -6231,10 +6300,14 @@ class ConvertCumOp : public OpRewritePattern { Type input_element_type = input_type.getElementType(); // TODO(hinsu): Handle complex element types. 
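Reading aid, not part of the patch: ConvertCumOp lowers tf.Cumsum/tf.Cumprod to a reduce_window whose window (with the padding set up below) covers all elements up to the current position along the axis; for exclusive mode the result is then padded low by one init element and has its last element dropped. The intended result for a 1-D cumulative sum, sketched in plain C++:

#include <vector>

// Sketch only: cumulative sum along one axis, inclusive or exclusive.
std::vector<int> CumSum1D(const std::vector<int> &x, bool exclusive) {
  std::vector<int> out(x.size(), 0);
  int acc = 0;  // init value (0 for sum, 1 for prod)
  for (size_t i = 0; i < x.size(); ++i) {
    if (exclusive) {  // equivalent to pad-low-by-one, drop-last on the result
      out[i] = acc;
      acc += x[i];
    } else {
      acc += x[i];
      out[i] = acc;
    }
  }
  return out;  // {1, 2, 3} -> inclusive {1, 3, 6}, exclusive {0, 1, 3}
}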
- if (!input_element_type.isIntOrFloat()) return failure(); + if (!input_element_type.isIntOrFloat()) { + return rewriter.notifyMatchFailure(op, + "input element type not int or float"); + } Type sum_element_type = GetSumAccumulationType(input_element_type); - input = rewriter.create(op.getLoc(), input, sum_element_type); + input = rewriter.create(op.getLoc(), input, + sum_element_type); SmallVector window_dims(rank, 1); SmallVector window_strides(rank, 1); @@ -6248,16 +6321,17 @@ class ConvertCumOp : public OpRewritePattern { {rank, 2}, rewriter.getIntegerType(64)), paddings); - int64_t init_value = (std::is_same::value) ? 0 : 1; + int64_t init_value = + (std::is_same::value) ? 0 : 1; Value init = GetScalarConstOfType(sum_element_type, op.getLoc(), init_value, &rewriter); - auto reduce = rewriter.create( + auto reduce = rewriter.create( op.getLoc(), input.getType(), input, init, - GetI64ElementsAttr(rewriter.getI64ArrayAttr(window_dims)), - GetI64ElementsAttr(rewriter.getI64ArrayAttr(window_strides)), - /*base_dilations=*/DenseIntElementsAttr(), - /*window_dilations=*/DenseIntElementsAttr(), paddings_attr); + GetI64ArrayAttr(window_dims, &rewriter), + GetI64ArrayAttr(window_strides, &rewriter), + /*base_dilations=*/DenseI64ArrayAttr(), + /*window_dilations=*/DenseI64ArrayAttr(), paddings_attr); BuildReduceBody(sum_element_type, &reduce.getBody(), &rewriter); Value result = reduce.getResult(0); @@ -6272,20 +6346,20 @@ class ConvertCumOp : public OpRewritePattern { llvm::SmallVector interior_padding(rank, 0); low_padding[axis] = 1; high_padding[axis] = -1; - result = rewriter.create( - op.getLoc(), result, init, GetI64ElementsAttr(low_padding, &rewriter), - GetI64ElementsAttr(high_padding, &rewriter), - GetI64ElementsAttr(interior_padding, &rewriter)); + result = rewriter.create( + op.getLoc(), result, init, GetI64ArrayAttr(low_padding, &rewriter), + GetI64ArrayAttr(high_padding, &rewriter), + GetI64ArrayAttr(interior_padding, &rewriter)); } // Convert back if we enlarged the element type's bitwidth. - result = - rewriter.create(op.getLoc(), result, input_element_type); + result = rewriter.create(op.getLoc(), result, + input_element_type); if (op.getReverse()) { llvm::SmallVector dims_to_reverse({axis}); - result = rewriter.create( - op.getLoc(), result, GetI64ElementsAttr(dims_to_reverse, &rewriter)); + result = rewriter.create( + op.getLoc(), result, GetI64ArrayAttr(dims_to_reverse, &rewriter)); } rewriter.replaceOp(op, result); @@ -6293,8 +6367,8 @@ class ConvertCumOp : public OpRewritePattern { } }; -using ConvertCumsumOp = ConvertCumOp; -using ConvertCumprodOp = ConvertCumOp; +using ConvertCumsumOp = ConvertCumOp; +using ConvertCumprodOp = ConvertCumOp; // Converts the Tensorflow ShapeOp to a sequence of Shape dialect and Standard // dialect lowerings. 
This involves extracting the shape type, extracting and @@ -6374,8 +6448,8 @@ class ConvertDynamicExpandDimsOp : public OpRewritePattern { auto from_extents = rewriter.create(op.getLoc(), dims); - rewriter.replaceOpWithNewOp(op, result_ty, input, - from_extents); + rewriter.replaceOpWithNewOp( + op, result_ty, input, from_extents); return success(); } }; @@ -6421,13 +6495,13 @@ class ConvertDynamicSqueezeOp : public OpRewritePattern { auto from_extents = rewriter.create(op.getLoc(), dims); - rewriter.replaceOpWithNewOp(op, result_ty, input, - from_extents); + rewriter.replaceOpWithNewOp( + op, result_ty, input, from_extents); return success(); } }; -// Converts tf.XlaConvV2 to mhlo.Conv +// Converts tf.XlaConvV2 to stablehlo.Conv class ConvertXlaConvV2Op : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; @@ -6446,23 +6520,17 @@ class ConvertXlaConvV2Op : public OpRewritePattern { return failure(); auto window_strides_named_attr = rewriter.getNamedAttr( - "window_strides", - mlir::cast(hlo::convertElementsAttr( - window_strides_attr, rewriter.getIntegerType(64)))); + "window_strides", ToDenseI64ArrayAttr(window_strides_attr, &rewriter)); auto padding_named_attr = rewriter.getNamedAttr( "padding", mlir::cast(hlo::convertElementsAttr( padding_attr, rewriter.getIntegerType(64)))); auto lhs_dilation_named_attr = rewriter.getNamedAttr( - "lhs_dilation", - mlir::cast(hlo::convertElementsAttr( - lhs_dilation_attr, rewriter.getIntegerType(64)))); + "lhs_dilation", ToDenseI64ArrayAttr(lhs_dilation_attr, &rewriter)); auto rhs_dilation_named_attr = rewriter.getNamedAttr( - "rhs_dilation", - mlir::cast(hlo::convertElementsAttr( - rhs_dilation_attr, rewriter.getIntegerType(64)))); + "rhs_dilation", ToDenseI64ArrayAttr(rhs_dilation_attr, &rewriter)); int64_t feature_group_count_val = feature_group_count_attr.getValues()[0].getInt(); @@ -6477,14 +6545,14 @@ class ConvertXlaConvV2Op : public OpRewritePattern { dnums.ParseFromString(op.getDimensionNumbersAttr().getValue().str()); auto dimension_numbers_named_attr = rewriter.getNamedAttr( "dimension_numbers", - xla::ConvertConvDimensionNumbers(dnums, &rewriter)); + xla::stablehlo::ConvertConvDimensionNumbers(dnums, &rewriter)); xla::PrecisionConfig precision_config; precision_config.ParseFromString( op.getPrecisionConfigAttr().getValue().str()); auto precision_config_named_attr = rewriter.getNamedAttr( "precision_config", - xla::ConvertPrecisionConfig(&precision_config, &rewriter)); + xla::stablehlo::ConvertPrecisionConfig(&precision_config, &rewriter)); SmallVector operands{op.getLhs(), op.getRhs()}; NamedAttribute attrs[] = { @@ -6492,13 +6560,13 @@ class ConvertXlaConvV2Op : public OpRewritePattern { lhs_dilation_named_attr, rhs_dilation_named_attr, feature_group_count_named_attr, batch_group_count_named_attr, dimension_numbers_named_attr, precision_config_named_attr}; - rewriter.replaceOpWithNewOp(op, op.getType(), operands, - llvm::ArrayRef(attrs)); + rewriter.replaceOpWithNewOp( + op, op.getType(), operands, llvm::ArrayRef(attrs)); return success(); } }; -// Converts tf.XlaSelectAndScatter to mhlo.SelectAndScatter +// Converts tf.XlaSelectAndScatter to stablehlo.SelectAndScatter class ConvertXlaSelectAndScatterOp : public OpRewritePattern { public: @@ -6516,13 +6584,11 @@ class ConvertXlaSelectAndScatterOp Location loc = op.getLoc(); SmallVector result_types{op.getResult().getType()}; - // Create the mhlo.SelectAndScatter op. - auto select_and_scatter_op = rewriter.create( + // Create the stablehlo.SelectAndScatter op. 
+ auto select_and_scatter_op = rewriter.create( loc, result_types, op.getOperand(), op.getSource(), op.getInitValue(), - mlir::cast(hlo::convertElementsAttr( - window_dimensions, rewriter.getIntegerType(64))), - mlir::cast(hlo::convertElementsAttr( - window_strides, rewriter.getIntegerType(64))), + ToDenseI64ArrayAttr(window_dimensions, &rewriter), + ToDenseI64ArrayAttr(window_strides, &rewriter), mlir::cast( hlo::convertElementsAttr(padding, rewriter.getIntegerType(64)))); @@ -6545,7 +6611,7 @@ class ConvertXlaSelectAndScatterOp } }; -// Convert tf.XlaSort to mhlo.Sort +// Convert tf.XlaSort to stablehlo.Sort class ConvertXlaSortOp : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; @@ -6554,10 +6620,10 @@ class ConvertXlaSortOp : public OpRewritePattern { PatternRewriter &rewriter) const override { // Create the sort op. Type element_type = getElementTypeOrSelf(op.getInput().getType()); - auto sort_op = - createSortOp(&rewriter, op.getLoc(), {op.getInput()}, {element_type}, - /*dimension=*/-1, /*isStable=*/false, - /*direction=*/ComparisonDirection::LT); + auto sort_op = stablehlo::createSortOp( + &rewriter, op.getLoc(), {op.getInput()}, {element_type}, + /*dimension=*/-1, /*isStable=*/false, + /*direction=*/stablehlo::ComparisonDirection::LT); rewriter.replaceOp(op, sort_op.getResult(0)); return success(); } @@ -6575,7 +6641,7 @@ inline std::optional TensorFlowRngAlgToXla( return std::nullopt; } -// Converts tf.XlaRngBitGenerator op to mhlo.RngBitGenerator op. +// Converts tf.XlaRngBitGenerator op to stablehlo.RngBitGenerator op. class ConvertXlaRngBitGeneratorOp : public OpRewritePattern { public: @@ -6596,10 +6662,10 @@ class ConvertXlaRngBitGeneratorOp return op.emitOpError() << "unknown algorithm"; } - auto algorithm_attr = mlir::mhlo::RngAlgorithmAttr::get( + auto algorithm_attr = mlir::stablehlo::RngAlgorithmAttr::get( rewriter.getContext(), - *mlir::mhlo::symbolizeRngAlgorithm(xla_alg.value())); - auto rng_bit_generator_op = rewriter.create( + *mlir::stablehlo::symbolizeRngAlgorithm(xla_alg.value())); + auto rng_bit_generator_op = rewriter.create( loc, op.getResultTypes(), algorithm_attr, op.getInitialState()); rewriter.replaceOp(op, rng_bit_generator_op.getResults()); @@ -6608,7 +6674,7 @@ class ConvertXlaRngBitGeneratorOp } }; -// Converts tf.XlaVariadicReduceV2 to mhlo.Reduce +// Converts tf.XlaVariadicReduceV2 to stablehlo.Reduce class ConvertXlaVariadicReduceV2Op : public OpRewritePattern { public: @@ -6626,10 +6692,12 @@ class ConvertXlaVariadicReduceV2Op func_ty.getResults(), [](Type ty) { return mlir::cast(ty).getElementType(); })}; - // Create the mhlo.reduce op. - auto reduce_op = rewriter.create( + // Create the stablehlo.reduce op. + auto reduce_op = rewriter.create( loc, op.getInputs(), op.getInitValues(), - GetI64ElementsAttr(op.getDimensionsToReduce()), elementTypes); + ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getDimensionsToReduce()), + &rewriter), + elementTypes); // Insert a call to the reducer in the region of the mhlo op. BuildBodyWithCall(rewriter, loc, func, func_ty, &reduce_op.getBody()); @@ -6640,7 +6708,7 @@ class ConvertXlaVariadicReduceV2Op } }; -// Convert tf.XlaVariadicSort to mhlo.Sort +// Convert tf.XlaVariadicSort to stablehlo.Sort class ConvertXlaVariadicSortOp : public OpRewritePattern { public: @@ -6651,8 +6719,8 @@ class ConvertXlaVariadicSortOp Location loc = op.getLoc(); ElementsAttr dimension; matchPattern(op.getDimension(), m_Constant(&dimension)); - // Create the mhlo.sort op. 
- auto sort_op = rewriter.create( + // Create the stablehlo.sort op. + auto sort_op = rewriter.create( loc, op.getInputs(), dimension.getValues()[0].getInt(), op.getIsStable()); mlir::SymbolRefAttr func = op.getComparator(); @@ -6667,7 +6735,7 @@ class ConvertXlaVariadicSortOp } }; -// Convert tf.XlaReducePrecision to mhlo.ReducePrecision +// Convert tf.XlaReducePrecision to stablehlo.ReducePrecision class ConvertXlaReducePrecisionOp : public OpRewritePattern { public: @@ -6685,7 +6753,7 @@ class ConvertXlaReducePrecisionOp APInt mantissa_bits = op.getMantissaBitsAttr().getValue(); IntegerAttr new_mantissa_attr = IntegerAttr::get(int32_type, mantissa_bits.truncSSat(32)); - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( op, op.getType(), op.getOperand(), new_exponent_attr, new_mantissa_attr); return success(); @@ -6699,7 +6767,7 @@ class LowerYieldOp : public OpConversionPattern { LogicalResult matchAndRewrite( TF::YieldOp op, TF::YieldOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { - rewriter.replaceOpWithNewOp(op, adaptor.getOperands()); + rewriter.replaceOpWithNewOp(op, adaptor.getOperands()); return success(); } }; @@ -6723,7 +6791,7 @@ class LowerControlFlowOp : public OpConversionPattern { LogicalResult matchAndRewrite( SrcOpT op, typename SrcOpT::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { - DstOpT mhlo_op; + DstOpT stablehlo_op; Location loc = op.getLoc(); // To handle quant type conversions, use the converted operands' element @@ -6731,20 +6799,20 @@ class LowerControlFlowOp : public OpConversionPattern { // result types. This is only done for the While op for now. llvm::SmallVector element_types; int64_t num_results = op.getNumResults(); - if constexpr (std::is_same::value) { + if constexpr (std::is_same::value) { element_types.reserve(num_results); for (Value value : adaptor.getOperands()) { element_types.push_back(getElementTypeOrSelf(value.getType())); } } - if constexpr (std::is_same::value) { + if constexpr (std::is_same::value) { // Explicitly handle the Case op because it has variadic regions and takes // the number of regions as an input along with the operands. - mhlo_op = rewriter.create(loc, op.getResultTypes(), - adaptor.getBranchIndex(), - op.getBranches().size()); - } else if constexpr (std::is_same::value) { + stablehlo_op = rewriter.create(loc, op.getResultTypes(), + adaptor.getBranchIndex(), + op.getBranches().size()); + } else if constexpr (std::is_same::value) { llvm::SmallVector while_result_types; while_result_types.reserve(num_results); for (int64_t idx = 0; idx < num_results; ++idx) { @@ -6752,21 +6820,21 @@ class LowerControlFlowOp : public OpConversionPattern { while_result_types.push_back(ty); } - mhlo_op = rewriter.create(loc, TypeRange(while_result_types), - adaptor.getOperands()); + stablehlo_op = rewriter.create(loc, TypeRange(while_result_types), + adaptor.getOperands()); } else { - mhlo_op = rewriter.create(loc, op.getResultTypes(), - adaptor.getOperands()); + stablehlo_op = rewriter.create(loc, op.getResultTypes(), + adaptor.getOperands()); } int64_t num_regions = op.getNumRegions(); for (int64_t idx = 0; idx < num_regions; ++idx) { - Region ®ion = mhlo_op.getBodyRegion(idx); + Region ®ion = stablehlo_op.getBodyRegion(idx); rewriter.inlineRegionBefore(op.getBodyRegion(idx), region, region.end()); // Update region's entry blocks argument types to handle quantized element // types. 
- if constexpr (std::is_same::value) { + if constexpr (std::is_same::value) { TypeConverter::SignatureConversion signature(num_results); Block &block = region.front(); for (const auto &[block_idx, original_ty] : @@ -6780,13 +6848,14 @@ class LowerControlFlowOp : public OpConversionPattern { } // Replace all uses of `op` results with the newly created op. - rewriter.replaceOp(op, mhlo_op); + rewriter.replaceOp(op, stablehlo_op); return success(); } }; } // end namespace #include "tensorflow/compiler/mlir/tf2xla/transforms/generated_legalize_tf.inc" + // LINT.IfChange void PopulateLegalizeTfPatterns(MLIRContext *context, RewritePatternSet *patterns) { @@ -6886,12 +6955,13 @@ void PopulateLegalizeTfPatterns(MLIRContext *context, ConvertConv2DDynamic, ConvertPadOpDynamic, ConvertGatherNdOpDynamic, - LowerControlFlowOp, - LowerControlFlowOp, - LowerControlFlowOp, + LowerControlFlowOp, + LowerControlFlowOp, + LowerControlFlowOp, LowerYieldOp>(context); // clang-format on } // LINT.ThenChange(:MlirAlwaysOps) -} // end namespace mhlo + +} // namespace hlo } // end namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td index 1f6a999cc337..5507c82bc6f4 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td @@ -20,8 +20,9 @@ include "mlir/Dialect/Shape/IR/ShapeOps.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/Dialect/Tensor/IR/TensorOps.td" include "stablehlo/dialect/ChloOps.td" +include "stablehlo/dialect/StablehloOps.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" -include "mhlo/IR/hlo_ops.td" +include "mhlo/IR/hlo_ops.td" // for hlo_utils.td def SignedIntTensor : TensorOf<[I1, I8, I16, I32, I64]>; def UnsignedIntTensor : TensorOf<[UI8, UI16, UI32, UI64]>; @@ -33,41 +34,51 @@ def IEEEFloatTensor : TensorOf<[F16, F32, F64]>; // BatchNorm op patterns. //===----------------------------------------------------------------------===// -def FalseBoolAttr : AttrConstraint().getValue()">>; -def TrueBoolAttr : AttrConstraint().getValue()">>; +def FalseBoolAttr : AttrConstraint($_self).getValue()">>; +def TrueBoolAttr : AttrConstraint($_self).getValue()">>; def CastValueToI64: NativeCodeCall< "CastValueToI64($0.getLoc(), $1, &$_builder)">; def CastValueToElementType: NativeCodeCall< - "$_builder.create($0.getLoc(), $1, " + "$_builder.create($0.getLoc(), $1, " "getElementTypeOrSelf($2.getType()))">; // Here, $0 is an ElementsAttr with exactly one element of type integer. $1 is // the corresponding value of ranked tensor type whose axis is referred in $0. def GetHLOAxisFromTFAxis : NativeCodeCall< "GetHLOAxisFromTFAxis(" - "$0, $1.getType().cast().getRank(), &$_builder)">; + "$0, llvm::cast($1.getType()).getRank(), &$_builder)">; // Same as the above but with $1 of type operand_range from variadic TensorFlow // input. 
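Reading aid, not part of the patch: GetHLOAxisFromTFAxis (and the variadic variant just below) converts a TensorFlow axis, which may be negative and count from the back, into the non-negative axis the StableHLO ops expect. Ignoring the attribute plumbing, the conversion is roughly:

#include <cstdint>

// Sketch only: normalize a possibly-negative TF axis given the tensor rank.
int64_t NormalizeAxis(int64_t tf_axis, int64_t rank) {
  return tf_axis < 0 ? tf_axis + rank : tf_axis;
}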
def GetHLOAxisFromTFAxisVariadic : NativeCodeCall< "GetHLOAxisFromTFAxis(" - "$0, (*$1.begin()).getType().cast().getRank(), " + "$0, llvm::cast((*$1.begin()).getType()).getRank(), " "&$_builder)">; -def CastElementsToI64Elements : NativeCodeCall< - "hlo::convertElementsAttr(" - "$0.cast(), $_builder.getIntegerType(64)).cast()">; +def CastElementsToI64Elements : NativeCodeCall<[{ + llvm::cast(hlo::convertElementsAttr( + llvm::cast($0), $_builder.getIntegerType(64))) + }]>; -def EmptyDotAlgorithmAttr : NativeCodeCall<"mlir::mhlo::DotAlgorithmAttr{}">; +def CastElementsToI64Array : NativeCodeCall<[{ + ToDenseI64ArrayAttr( + llvm::cast(hlo::convertElementsAttr( + llvm::cast($0), $_builder.getIntegerType(64))), &$_builder) + }]>; + +def EmptyDotAlgorithmAttr : NativeCodeCall<"mlir::stablehlo::DotAlgorithmAttr{}">; + +def ConstDefaultResultAccuracyAttr : + ConstantAttr; //===----------------------------------------------------------------------===// // ApproximateEqual op pattern. //===----------------------------------------------------------------------===// -class MHLO_ComparisonDirectionValue : - ConstantAttr; +class StableHLO_ComparisonDirectionValue : + ConstantAttr; class CHLO_ComparisonDirectionValue : ConstantAttr; @@ -75,8 +86,8 @@ class CHLO_ComparisonDirectionValue : // TODO(b/228291745): Assert that $x and $y have the same shape. def : Pat<(TF_ApproximateEqualOp:$result $x, $y, $tolerance), (CHLO_BroadcastCompareOp - (MHLO_AbsOp:$abs (MHLO_SubtractOp $x, $y)), - (CastValueToElementType $result, (MHLO_ConstantOp $tolerance), $abs), + (StableHLO_AbsOp:$abs (StableHLO_SubtractOp $x, $y)), + (CastValueToElementType $result, (StableHLO_ConstantOp $tolerance), $abs), (NullDenseI64ArrayAttr), CHLO_ComparisonDirectionValue<"LT">, (CHLO_DEFAULT_COMPARISON_TYPE))>; @@ -133,7 +144,7 @@ def LowerRightShiftUnsigned : // // return floor(div(x, y)) def : Pat<(TF_FloorDivOp AnyTensor:$l, AnyTensor:$r), - (MHLO_FloorOp + (StableHLO_FloorOp (CHLO_BroadcastDivOp $l, $r, (BinBroadcastDimensions $l, $r))), [(IEEEFloatTensor $l)]>; @@ -148,7 +159,7 @@ def : Pat<(TF_FloorDivOp AnyTensor:$l, AnyTensor:$r), // dimensions. This computes the broadcast of 'l' to broadcast('l', 'r') // without returning the broadcast of 'r' to broadcast('l', 'r'). def : Pat<(TF_FloorDivOp AnyTensor:$l, AnyTensor:$r), - (MHLO_SelectOp + (StableHLO_SelectOp (CHLO_BroadcastAndOp (CHLO_BroadcastCompareOp (CHLO_BroadcastMulOp:$mul @@ -159,18 +170,18 @@ def : Pat<(TF_FloorDivOp AnyTensor:$l, AnyTensor:$r), (CHLO_DEFAULT_COMPARISON_TYPE)), (CHLO_BroadcastCompareOp (CHLO_BroadcastCompareOp:$l_cmp $l, - (MHLO_ConstantOp:$l_zeros (GetScalarOfType<0> $l)), + (StableHLO_ConstantOp:$l_zeros (GetScalarOfType<0> $l)), (NullDenseI64ArrayAttr), CHLO_ComparisonDirectionValue<"LT">, (CHLO_DEFAULT_COMPARISON_TYPE)), (CHLO_BroadcastCompareOp:$r_cmp $r, - (MHLO_ConstantOp:$r_zeros (GetScalarOfType<0> $r)), + (StableHLO_ConstantOp:$r_zeros (GetScalarOfType<0> $r)), (NullDenseI64ArrayAttr), CHLO_ComparisonDirectionValue<"LT">, (CHLO_DEFAULT_COMPARISON_TYPE)), (BinBroadcastDimensions $l_cmp, $r_cmp), CHLO_ComparisonDirectionValue<"NE">, (CHLO_DEFAULT_COMPARISON_TYPE)), (NullDenseI64ArrayAttr)), (CHLO_BroadcastSubOp $div, - (MHLO_ConstantOp:$ones (GetScalarOfType<1> $div)), + (StableHLO_ConstantOp:$ones (GetScalarOfType<1> $div)), (NullDenseI64ArrayAttr)), $div), [(SignedIntTensor $l)]>; @@ -186,16 +197,16 @@ def : Pat<(TF_FloorDivOp AnyTensor:$l, AnyTensor:$r), // return trunc_mod != 0 && (y < 0 != trunc_mod < 0) ? 
trunc_mod + y // : trunc_mod def : Pat<(TF_FloorModOp AnyTensor:$l, AnyTensor:$r), - (MHLO_SelectOp + (StableHLO_SelectOp (CHLO_BroadcastAndOp (CHLO_BroadcastCompareOp (CHLO_BroadcastRemOp:$rem $l, $r, (BinBroadcastDimensions $l, $r)), - (MHLO_ConstantOp:$l_zeros (GetScalarOfType<0> $l)), + (StableHLO_ConstantOp:$l_zeros (GetScalarOfType<0> $l)), (NullDenseI64ArrayAttr), CHLO_ComparisonDirectionValue<"NE">, (CHLO_DEFAULT_COMPARISON_TYPE)), (CHLO_BroadcastCompareOp (CHLO_BroadcastCompareOp:$r_cmp $r, - (MHLO_ConstantOp:$r_zeros (GetScalarOfType<0> $r)), + (StableHLO_ConstantOp:$r_zeros (GetScalarOfType<0> $r)), (NullDenseI64ArrayAttr), CHLO_ComparisonDirectionValue<"LT">, (CHLO_DEFAULT_COMPARISON_TYPE)), (CHLO_BroadcastCompareOp:$rem_cmp $rem, $r_zeros, @@ -216,10 +227,10 @@ def : Pat<(TF_FloorModOp AnyTensor:$l, AnyTensor:$r), def Get2DTransposePerm: NativeCodeCall< "Get2DTransposePerm($0, &$_builder)">; -def : Pat<(TF_RiscAddOp $l, $r), (MHLO_AddOp $l, $r)>; +def : Pat<(TF_RiscAddOp $l, $r), (StableHLO_AddOp $l, $r)>; def : Pat<(TF_RiscDotOp $a, $b, $transpose_a, $transpose_b), - (MHLO_DotOp + (StableHLO_DotOp (TF_TransposeOp $a, (TF_ConstOp (Get2DTransposePerm $transpose_a))), (TF_TransposeOp $b, (TF_ConstOp (Get2DTransposePerm $transpose_b))), /*precision_config=*/(NullArrayAttr))>; @@ -261,7 +272,7 @@ class EqualityPat (CHLO_BroadcastCompareOp $l, $r, (BinBroadcastDimensions $l, $r), direction, (CHLO_DEFAULT_COMPARISON_TYPE)), - [(MHLO_Tensor $l)]>; + [(HLO_Tensor $l)]>; def : EqualityPat>; def : EqualityPat>; @@ -271,17 +282,17 @@ def : EqualityPat>; //===----------------------------------------------------------------------===// def OneElementAttrPred - : CPred<"$_self.cast().getShapedType().getNumElements() == 1">; + : CPred<"llvm::cast($_self).getShapedType().getNumElements() == 1">; def OneElementAttr : ElementsAttrBase, "Scalar ElementsAttr">; def HasRankedFirstOperand - : Constraint()">>; + : Constraint((*$0.begin()).getType())">>; def IsShapedTensor - : Constraint()">>; + : Constraint($0.getType())">>; // This pattern converts TensorFlow axis format to HLO axis format which // doesn't wrap around like TensorFlow and is always positive. For this @@ -292,7 +303,7 @@ def IsShapedTensor // if HLO constant op is introduced as an replacement for the TensorFlow // Constant op. def : Pat<(TF_ConcatV2Op $inputs, (ConstantLikeMatcher OneElementAttr:$axis)), - (MHLO_ConcatenateOp $inputs, + (StableHLO_ConcatenateOp $inputs, (GetHLOAxisFromTFAxisVariadic $axis, $inputs)), [(HasRankedFirstOperand $inputs)]>; @@ -301,16 +312,16 @@ def : Pat<(TF_ConcatV2Op $inputs, (ConstantLikeMatcher OneElementAttr:$axis)), //===----------------------------------------------------------------------===// def : Pat<(TF_CollectivePermuteOp $input, (ConstantLikeMatcher ElementsAttr:$source_target_pairs)), - (MHLO_CollectivePermuteOp $input, + (StableHLO_CollectivePermuteOp $input, (CastElementsToI64Elements $source_target_pairs), - (NullChannelHandleAttr))>; + (StableHLO_NullChannelHandleAttr))>; //===----------------------------------------------------------------------===// // CrossReplicaSum op patterns. 
//===----------------------------------------------------------------------===// def : Pat<(TF_CrossReplicaSumOp $input, (ConstantLikeMatcher ElementsAttr:$group_assignment)), - (MHLO_CrossReplicaSumOp $input, + (StableHLO_CrossReplicaSumOp $input, (CastElementsToI64Elements $group_assignment))>; //===----------------------------------------------------------------------===// @@ -319,27 +330,27 @@ def : Pat<(TF_CrossReplicaSumOp $input, (ConstantLikeMatcher ElementsAttr:$group def ValueToVariadic: NativeCodeCall<"SmallVector{$0}">; def : Pat<(TF_AllToAllOp AnyRankedTensor:$input, (ConstantLikeMatcher ElementsAttr:$group_assignment), I64Attr:$concat_dimension, $split_dimension, $split_count), - (MHLO_AllToAllOp (ValueToVariadic $input), $split_dimension, $concat_dimension, $split_count, (CastElementsToI64Elements $group_assignment), (NullChannelHandleAttr))>; + (StableHLO_AllToAllOp (ValueToVariadic $input), $split_dimension, $concat_dimension, $split_count, (CastElementsToI64Elements $group_assignment), (StableHLO_NullChannelHandleAttr))>; //===----------------------------------------------------------------------===// // FFT op patterns. //===----------------------------------------------------------------------===// -class MHLO_FftTypeValue : - ConstantAttr; +class StableHLO_FftTypeValue : + ConstantAttr; def GetInnerDimFromValue : NativeCodeCall< - "GetInnerDimFromValue($0.getType().cast(), &$_builder)">; + "GetInnerDimFromValue(llvm::cast($0.getType()), &$_builder)">; def CheckInnerDimStatic - : Constraint(), &$_builder)">>; + : Constraint($0.getType()), &$_builder)">>; def : Pat<(TF_FFTOp:$res $input), - (MHLO_FftOp $input, MHLO_FftTypeValue<"FFT">, (GetInnerDimFromValue $res)), + (StableHLO_FftOp $input, StableHLO_FftTypeValue<"FFT">, (GetInnerDimFromValue $res)), [(CheckInnerDimStatic $input)]>; def : Pat<(TF_IFFTOp:$res $input), - (MHLO_FftOp $input, MHLO_FftTypeValue<"IFFT">, (GetInnerDimFromValue $res)), + (StableHLO_FftOp $input, StableHLO_FftTypeValue<"IFFT">, (GetInnerDimFromValue $res)), [(CheckInnerDimStatic $input)]>; //===----------------------------------------------------------------------===// @@ -352,7 +363,7 @@ def : Pat<(TF_IFFTOp:$res $input), def LegalizeGatherV2 : Pat<(TF_GatherV2Op AnyRankedTensor:$params, AnyRankedTensor:$indices, (ConstantLikeMatcher ElementsAttr:$axis), $batch_dims), - (MHLO_TorchIndexSelectOp $params, $indices, + (StableHLO_TorchIndexSelectOp $params, $indices, (GetHLOAxisFromTFAxis $axis, $params), (GetHLOAxisFromTFAxis $batch_dims, $indices))>; @@ -361,17 +372,17 @@ def LegalizeGatherV2 : //===----------------------------------------------------------------------===// class SliceDenseIntElementsAttrColumn2D : NativeCodeCall< - "SliceDenseIntElementsAttrColumn2D($0.cast(), " # column # " )">; + "SliceDenseIntElementsAttrColumn2D(llvm::cast($0), " # column # " )">; class SliceDenseIntElementsAttr : NativeCodeCall< - "SliceDenseIntElementsAttr($0.cast(), " # index # ", " # axis # ")">; + "SliceDenseIntElementsAttr(llvm::cast($0), " # index # ", " # axis # ")">; // Interior padding attribute based on the TF padding. 
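Reading aid, not part of the patch: the TF_PadV2Op pattern just below splits the TF paddings matrix into low and high edge-padding columns and supplies the separate interior-padding attribute that stablehlo.pad additionally requires. As a reminder of what interior padding means, a 1-D plain C++ sketch (negative padding is not handled here):

#include <vector>

// Sketch only: 1-D pad with low/high edge padding plus interior padding
// (pad values inserted between neighbouring elements), as in stablehlo.pad.
std::vector<float> Pad1D(const std::vector<float> &x, float pad_value,
                         int low, int high, int interior) {
  std::vector<float> out(low, pad_value);
  for (size_t i = 0; i < x.size(); ++i) {
    if (i > 0)
      for (int k = 0; k < interior; ++k) out.push_back(pad_value);
    out.push_back(x[i]);
  }
  for (int k = 0; k < high; ++k) out.push_back(pad_value);
  return out;
}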
-def GetInteriorPadding : NativeCodeCall < - "GetInteriorPadding($0.cast())">; +def GetInteriorPadding : NativeCodeCall< + "GetInteriorPadding(llvm::cast($0))">; def : Pat<(TF_PadV2Op $input, (ConstantLikeMatcher ElementsAttr:$padding), $c), - (MHLO_PadOp $input, $c, + (StableHLO_PadOp $input, $c, (SliceDenseIntElementsAttrColumn2D<"0"> $padding), (SliceDenseIntElementsAttrColumn2D<"1"> $padding), (GetInteriorPadding $padding))>; @@ -391,55 +402,55 @@ foreach src = [TF_PreventGradientOp, TF_CheckNumericsOp] in // MatMul op patterns. //===----------------------------------------------------------------------===// -def GetPrecisionConfig: NativeCodeCall< +def StableHLO_GetPrecisionConfig: NativeCodeCall< "GetPrecisionConfig(&$_builder)">; def : Pat<(TF_MatMulOp $a, $b, $transpose_a, $transpose_b, $grad_a, $grad_b), - (MHLO_DotOp + (StableHLO_DotOp (TF_TransposeOp $a, (TF_ConstOp (Get2DTransposePerm $transpose_a))), (TF_TransposeOp $b, (TF_ConstOp (Get2DTransposePerm $transpose_b))), - /*precision_config=*/(GetPrecisionConfig))>; + /*precision_config=*/(StableHLO_GetPrecisionConfig))>; //===----------------------------------------------------------------------===// // Lower `tf.ZerosLike` //===----------------------------------------------------------------------===// def : Pat<(TF_ZerosLikeOp AnyTensor:$arg), - (MHLO_ConstantLike<"0"> $arg)>; + (StableHLO_ConstantLike<"0"> $arg)>; //===----------------------------------------------------------------------===// // Lower `tf.OnesLike` //===----------------------------------------------------------------------===// def : Pat<(TF_OnesLikeOp AnyTensor:$arg), - (MHLO_ConstantLike<"1"> $arg)>; + (StableHLO_ConstantLike<"1"> $arg)>; //===----------------------------------------------------------------------===// // Elu op patterns. //===----------------------------------------------------------------------===// def : Pat<(TF_EluOp AnyTensor:$features), - (MHLO_SelectOp - (MHLO_CompareOp + (StableHLO_SelectOp + (StableHLO_CompareOp $features, - (MHLO_ConstantLike<"0">:$zero $features), - MHLO_ComparisonDirectionValue<"GT">, (MHLO_DEFAULT_COMPARISON_TYPE)), + (StableHLO_ConstantLike<"0">:$zero $features), + StableHLO_ComparisonDirectionValue<"GT">, (STABLEHLO_DEFAULT_COMPARISON_TYPE)), $features, - (MHLO_Expm1Op $features))>; + (StableHLO_Expm1Op $features, ConstDefaultResultAccuracyAttr))>; def : Pat<(TF_EluGradOp AnyStaticShapeTensor:$gradients, AnyRankedTensor:$features), - (MHLO_SelectOp + (StableHLO_SelectOp (CHLO_BroadcastCompareOp $features, - (MHLO_ConstantOp:$zero (GetScalarOfType<0> $features)), + (StableHLO_ConstantOp:$zero (GetScalarOfType<0> $features)), (BinBroadcastDimensions $zero, $features), CHLO_ComparisonDirectionValue<"GT">, (CHLO_DEFAULT_COMPARISON_TYPE)), $gradients, - (MHLO_MulOp + (StableHLO_MulOp $gradients, (CHLO_BroadcastAddOp $features, - (MHLO_ConstantOp:$one (GetScalarOfType<1> $features)), + (StableHLO_ConstantOp:$one (GetScalarOfType<1> $features)), (BinBroadcastDimensions $one, $features))))>; //===----------------------------------------------------------------------===// @@ -452,24 +463,24 @@ def : Pat<(TF_EluGradOp AnyStaticShapeTensor:$gradients, AnyRankedTensor:$featur // TODO(hinsu): Lower quantized types after supporting them in GetScalarOfType. 
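Reading aid, not part of the patch: the activation patterns around this point reduce to simple elementwise formulas. As a plain C++ reference for the Elu pattern above and the Relu/Relu6 patterns just below:

#include <algorithm>
#include <cmath>

// Sketch only: scalar versions of the lowered activations.
float Elu(float x) { return x > 0.0f ? x : std::expm1(x); }   // select(x > 0, x, expm1(x))
float Relu(float x) { return std::max(0.0f, x); }              // max(0, x)
float Relu6(float x) { return std::clamp(x, 0.0f, 6.0f); }     // clamp(0, x, 6)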
def : Pat<(TF_ReluOp AnyTensor:$input), (CHLO_BroadcastMaxOp - (MHLO_ConstantOp:$zero (GetScalarOfType<0> $input)), $input, + (StableHLO_ConstantOp:$zero (GetScalarOfType<0> $input)), $input, (BinBroadcastDimensions $zero, $input)), [(TF_IntOrFpTensor $input)]>; // TODO(hinsu): Lower quantized types after supporting them in GetScalarOfType. def : Pat<(TF_Relu6Op AnyRankedTensor:$input), - (MHLO_ClampOp (MHLO_ConstantOp (GetScalarOfType<0> $input)), $input, - (MHLO_ConstantOp (GetScalarOfType<6> $input))), + (StableHLO_ClampOp (StableHLO_ConstantOp (GetScalarOfType<0> $input)), $input, + (StableHLO_ConstantOp (GetScalarOfType<6> $input))), [(TF_IntOrFpTensor $input)]>; // ReluGrad(gradients, features) = gradients * (features > 0) // The condition that $gradients and $features need to have the same shape is // implicitly enforced: $zero is created to have the same shape as $features, -// MHLO_SelectOp enforces that $gradients and $zero have the same shape. +// StableHLO_SelectOp enforces that $gradients and $zero have the same shape. def : Pat<(TF_ReluGradOp AnyTensor:$gradients, AnyTensor:$features), - (MHLO_SelectOp - (MHLO_CompareOp $features, (MHLO_ConstantLike<"0">:$zero $features), - MHLO_ComparisonDirectionValue<"GT">, (MHLO_DEFAULT_COMPARISON_TYPE)), + (StableHLO_SelectOp + (StableHLO_CompareOp $features, (StableHLO_ConstantLike<"0">:$zero $features), + StableHLO_ComparisonDirectionValue<"GT">, (STABLEHLO_DEFAULT_COMPARISON_TYPE)), $gradients, $zero)>; //===----------------------------------------------------------------------===// @@ -479,9 +490,9 @@ def : Pat<(TF_ReluGradOp AnyTensor:$gradients, AnyTensor:$features), /// Converts a TF::SoftsignOp to HLO. /// Softsign(features) = features / (1 + abs(features)) def : Pat<(TF_SoftsignOp AnyTensor:$input), - (MHLO_DivOp + (StableHLO_DivOp $input, - (MHLO_AddOp (MHLO_ConstantLike<"1"> $input), (MHLO_AbsOp $input)) + (StableHLO_AddOp (StableHLO_ConstantLike<"1"> $input), (StableHLO_AbsOp $input)) ) >; @@ -490,12 +501,12 @@ def : Pat<(TF_SoftsignOp AnyTensor:$input), def : Pattern< (TF_SoftsignGradOp AnyRankedTensor:$gradients, AnyRankedTensor:$features), [(CHLO_BroadcastAddOp:$add - (MHLO_ConstantOp:$one (GetScalarOfType<1> $features)), (MHLO_AbsOp $features), + (StableHLO_ConstantOp:$one (GetScalarOfType<1> $features)), (StableHLO_AbsOp $features), (BinBroadcastDimensions $one, $features) ), (CHLO_BroadcastDivOp $gradients, - (MHLO_MulOp $add, $add), + (StableHLO_MulOp $add, $add), (BinBroadcastDimensions $gradients, $add) ) ]>; @@ -508,15 +519,15 @@ def UnpackStartingIndices: NativeCodeCall< "UnpackTensorAlongZeroDim($0.getLoc(), $1, &$_builder).getOutput()">; def CanBeTranslatedToDynamicSlice : Constraint())">>; + "CanBeTranslatedToDynamicSlice($0, $1, llvm::cast($2))">>; def TFSliceSizes2HLOSliceSizes : NativeCodeCall< - "TFSliceSizes2HLOSliceSizes($0, $1, $2.cast()," + "TFSliceSizes2HLOSliceSizes($0, $1, llvm::cast($2)," "&$_builder)">; -def : Pat<(TF_SliceOp:$op MHLO_Tensor:$input, MHLO_Tensor:$starting_indices, +def : Pat<(TF_SliceOp:$op HLO_Tensor:$input, HLO_Tensor:$starting_indices, (ConstantLikeMatcher AnyAttr:$slice_sizes)), - (MHLO_DynamicSliceOp $input, + (StableHLO_DynamicSliceOp $input, (UnpackStartingIndices $op, $starting_indices), (TFSliceSizes2HLOSliceSizes $input, $starting_indices, $slice_sizes)), [(CanBeTranslatedToDynamicSlice $input, $starting_indices, @@ -526,8 +537,8 @@ def : Pat<(TF_SliceOp:$op MHLO_Tensor:$input, MHLO_Tensor:$starting_indices, // Select op patterns. 
//===----------------------------------------------------------------------===// - def : Pat<(TF_SelectV2Op MHLO_Tensor:$pred, MHLO_Tensor:$on_true, - MHLO_Tensor:$on_false), + def : Pat<(TF_SelectV2Op HLO_Tensor:$pred, HLO_Tensor:$on_true, + HLO_Tensor:$on_false), (CHLO_BroadcastSelectOp $pred, $on_true, $on_false)>; //===----------------------------------------------------------------------===// @@ -560,47 +571,47 @@ def : Pat<(TF_LegacyCallOp:$op $args, $args_attrs, $res_attrs, //===----------------------------------------------------------------------===// // Handles axis conversion for TF reverse. -def ConvertAxisAttr : NativeCodeCall<"ConvertAxisAttr($0, $1.cast(), &$_builder)">; +def ConvertAxisAttr : NativeCodeCall<"ConvertAxisAttr($0, llvm::cast($1), &$_builder)">; def : Pat<(TF_ReverseV2Op AnyRankedTensor:$values, (ConstantLikeMatcher ElementsAttr:$axis)), - (MHLO_ReverseOp $values, (ConvertAxisAttr $values, $axis))>; + (StableHLO_ReverseOp $values, (ConvertAxisAttr $values, $axis))>; //===----------------------------------------------------------------------===// // Unary op patterns. //===----------------------------------------------------------------------===// foreach Mapping = [ - [TF_AbsOp, MHLO_AbsOp], - [TF_CeilOp, MHLO_CeilOp], - [TF_ComplexAbsOp, MHLO_AbsOp], - [TF_CosOp, MHLO_CosineOp], - [TF_Expm1Op, MHLO_Expm1Op], - [TF_ErfOp, MHLO_ErfOp], - [TF_FloorOp, MHLO_FloorOp], - [TF_ImagOp, MHLO_ImagOp], - [TF_InvertOp, MHLO_NotOp], - [TF_IsFiniteOp, MHLO_IsFiniteOp], - [TF_LogOp, MHLO_LogOp], - [TF_Log1pOp, MHLO_Log1pOp], - [TF_LogicalNotOp, MHLO_NotOp], - [TF_NegOp, MHLO_NegOp], - [TF_RealOp, MHLO_RealOp], - [TF_RsqrtOp, MHLO_RsqrtOp], - [TF_SigmoidOp, MHLO_LogisticOp], - [TF_SinOp, MHLO_SineOp], - [TF_SqrtOp, MHLO_SqrtOp], - [TF_TanhOp, MHLO_TanhOp], - [TF_TanOp, MHLO_TanOp] + [TF_AbsOp, StableHLO_AbsOp], + [TF_CeilOp, StableHLO_CeilOp], + [TF_ComplexAbsOp, StableHLO_AbsOp], + [TF_ErfOp, CHLO_ErfOp], + [TF_FloorOp, StableHLO_FloorOp], + [TF_ImagOp, StableHLO_ImagOp], + [TF_InvertOp, StableHLO_NotOp], + [TF_IsFiniteOp, StableHLO_IsFiniteOp], + [TF_LogicalNotOp, StableHLO_NotOp], + [TF_NegOp, StableHLO_NegOp], + [TF_RealOp, StableHLO_RealOp], ] in { - def : Pat<(Mapping[0] MHLO_Tensor:$input), + def : Pat<(Mapping[0] HLO_Tensor:$input), (Mapping[1] $input)>; } -def ConstDefaultResultAccuracyAttr : - ConstantAttr; -foreach Mapping = [[TF_ExpOp, MHLO_ExpOp]] in { - def : Pat<(Mapping[0] MHLO_Tensor:$input), +foreach Mapping = [ + [TF_CosOp, StableHLO_CosineOp], + [TF_ExpOp, StableHLO_ExpOp], + [TF_Expm1Op, StableHLO_Expm1Op], + [TF_LogOp, StableHLO_LogOp], + [TF_Log1pOp, StableHLO_Log1pOp], + [TF_RsqrtOp, StableHLO_RsqrtOp], + [TF_SigmoidOp, StableHLO_LogisticOp], + [TF_SinOp, StableHLO_SineOp], + [TF_SqrtOp, StableHLO_SqrtOp], + [TF_TanhOp, StableHLO_TanhOp], + [TF_TanOp, StableHLO_TanOp] + ] in { + def : Pat<(Mapping[0] HLO_Tensor:$input), (Mapping[1] $input, ConstDefaultResultAccuracyAttr)>; } @@ -619,28 +630,28 @@ foreach Mapping = [ [TF_LgammaOp, CHLO_LgammaOp], [TF_SinhOp, CHLO_SinhOp], ] in { - def : Pat<(Mapping[0] MHLO_AnyTensor:$input), + def : Pat<(Mapping[0] HLO_AnyTensor:$input), (Mapping[1] $input)>; } -def : Pat<(TF_AngleOp $x), (MHLO_Atan2Op (MHLO_ImagOp $x), (MHLO_RealOp $x))>; +def : Pat<(TF_AngleOp $x), (StableHLO_Atan2Op (StableHLO_ImagOp $x), (StableHLO_RealOp $x))>; // TODO(bixia): Lower with Truncate=True for floating point value conversions. 
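Note: the ConvertAxisAttr call in the ReverseV2 pattern above normalizes TF axis values before handing them to StableHLO. A hedged sketch of such a helper (the real body and return attribute type may differ) that wraps negative axes by the rank of $values:

static DenseIntElementsAttr ConvertAxisAttr(Value values, ElementsAttr axis,
                                            Builder *builder) {
  // Wrap negative TF axes into [0, rank) before building the reverse op.
  int64_t rank = llvm::cast<RankedTensorType>(values.getType()).getRank();
  SmallVector<int64_t> normalized;
  for (APInt a : axis.getValues<APInt>()) {
    int64_t v = a.getSExtValue();
    normalized.push_back(v < 0 ? v + rank : v);
  }
  return DenseIntElementsAttr::get(
      RankedTensorType::get({static_cast<int64_t>(normalized.size())},
                            builder->getIntegerType(64)),
      normalized);
}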
-def : Pat<(TF_CastOp $arg, ConstBoolAttrFalse), (MHLO_ConvertOp $arg)>; +def : Pat<(TF_CastOp $arg, ConstBoolAttrFalse), (StableHLO_ConvertOp $arg)>; def : Pat<(TF_TransposeOp:$res $arg, (ConstantLikeMatcher ElementsAttr:$permutation)), - (MHLO_TransposeOp $arg, (CastElementsToI64Elements $permutation))>; + (StableHLO_TransposeOp $arg, (CastElementsToI64Array $permutation))>; -// Lowering these ops with static shape to mhlo.reshape +// Lowering these ops with static shape to stablehlo.reshape foreach TfOp = [TF_ExpandDimsOp, TF_ReshapeOp, TF_SqueezeOp, ] in { - def : Pat<(TfOp:$res MHLO_Tensor:$arg, $ignored), - (MHLO_ReshapeOp $arg), [(AnyStaticShapeTensor $res)], [], + def : Pat<(TfOp:$res HLO_Tensor:$arg, $ignored), + (StableHLO_ReshapeOp $arg), [(AnyStaticShapeTensor $res)], [], (addBenefit 2)>; } // Returns NaN if x is NaN, 0 if x is 0, -1 if x < 0 and 1 if x > 0. -def : Pat<(TF_SignOp $x), (MHLO_SignOp $x)>; +def : Pat<(TF_SignOp $x), (StableHLO_SignOp $x)>; def BothElementTypesSameWidthIntOrFloat : Constraint; // TODO(jpienaar): Lower constant like to constant to broadcast if dynamic -// and going to MHLO. +// and going to StableHLO. //===----------------------------------------------------------------------===// // Random ops. //===----------------------------------------------------------------------===// // TODO(b/148269299): handle random number generator seeds/states correctly. -class MHLO_RngDistributionValue : - ConstantAttr; +class StableHLO_RngDistributionValue : + ConstantAttr; def : Pat<(TF_RandomUniformOp:$old $shape, $seed, $seed2), - (MHLO_RngOp - (MHLO_ConstantOp + (StableHLO_RngOp + (StableHLO_ConstantOp (NativeCodeCall<"$_builder.getFloatAttr(old.getDtype(), 0.0)">)), - (MHLO_ConstantOp + (StableHLO_ConstantOp (NativeCodeCall<"$_builder.getFloatAttr(old.getDtype(), 1.0)">)), (CastValueToI64 $old, $shape), - MHLO_RngDistributionValue<"UNIFORM">), + StableHLO_RngDistributionValue<"UNIFORM">), [(IsShapedTensor $shape)]>; def : Pat<(TF_RandomStandardNormalOp:$old $shape, $seed, $seed2), - (MHLO_RngOp - (MHLO_ConstantOp + (StableHLO_RngOp + (StableHLO_ConstantOp (NativeCodeCall<"$_builder.getFloatAttr(old.getDtype(), 0.0)">)), - (MHLO_ConstantOp + (StableHLO_ConstantOp (NativeCodeCall<"$_builder.getFloatAttr(old.getDtype(), 1.0)">)), (CastValueToI64 $old, $shape), - MHLO_RngDistributionValue<"NORMAL">), + StableHLO_RngDistributionValue<"NORMAL">), [(IsShapedTensor $shape)]>; //===----------------------------------------------------------------------===// // Sigmoid grad op. //===----------------------------------------------------------------------===// -// TODO(hinsu): Handle unranked inputs by broadcasting constant one to the -// shape of $l instead of having it as a constant. +// Only handle static shape here, dynamic shape is handled by +// ConvertSigmoidGradOpDynamic +def HasStaticShape : Constraint< + CPred<"::llvm::dyn_cast($0.getType()).hasStaticShape()">>; + def : Pat<(TF_SigmoidGradOp AnyRankedTensor:$l, AnyRankedTensor:$r), - (MHLO_MulOp - (MHLO_MulOp $r, $l), - (MHLO_SubtractOp (MHLO_ConstantOp (ConstantSplat<"1"> $l)), $l))>; + (StableHLO_MulOp + (StableHLO_MulOp $r, $l), + (StableHLO_SubtractOp (StableHLO_ConstantOp (ConstantSplat<"1"> $l)), $l)), + [(HasStaticShape $l)]>; //===----------------------------------------------------------------------===// // Softplus op. 
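Note: the Transpose pattern above switches from CastElementsToI64Elements to CastElementsToI64Array, reflecting that the StableHLO op carries its permutation as a dense i64 array rather than an elements attribute. A hedged sketch of what such a conversion helper might look like (name reuse aside, the body and signature are assumptions):

static DenseI64ArrayAttr CastElementsToI64Array(ElementsAttr elements,
                                                Builder *builder) {
  // Sign-extend every element of the constant permutation into an i64 array.
  SmallVector<int64_t> values;
  values.reserve(elements.getNumElements());
  for (APInt v : elements.getValues<APInt>()) values.push_back(v.getSExtValue());
  return builder->getDenseI64ArrayAttr(values);
}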
@@ -704,22 +719,22 @@ def EpsilonValue : NativeCodeCall<"GetEpsilonValue($0.getType())">; def : Pattern<(TF_SoftplusOp AnyTensor:$features), [ - (MHLO_ExpOp:$features_exp $features, ConstDefaultResultAccuracyAttr), + (StableHLO_ExpOp:$features_exp $features, ConstDefaultResultAccuracyAttr), (CHLO_BroadcastAddOp:$threshold - (MHLO_LogOp (MHLO_ConstantOp (EpsilonValue $features))), - (MHLO_ConstantOp (GetScalarOfType<2> $features)), + (StableHLO_LogOp (StableHLO_ConstantOp (EpsilonValue $features)), ConstDefaultResultAccuracyAttr), + (StableHLO_ConstantOp (GetScalarOfType<2> $features)), (NullDenseI64ArrayAttr) ), - (MHLO_SelectOp:$output + (StableHLO_SelectOp:$output (CHLO_BroadcastCompareOp $features, - (MHLO_NegOp $threshold), + (StableHLO_NegOp $threshold), (NullDenseI64ArrayAttr), CHLO_ComparisonDirectionValue<"GT">, (CHLO_DEFAULT_COMPARISON_TYPE) ), $features, - (MHLO_SelectOp + (StableHLO_SelectOp (CHLO_BroadcastCompareOp $features, $threshold, @@ -728,7 +743,7 @@ def : Pattern<(TF_SoftplusOp AnyTensor:$features), (CHLO_DEFAULT_COMPARISON_TYPE) ), $features_exp, - (MHLO_Log1pOp $features_exp) + (StableHLO_Log1pOp $features_exp, ConstDefaultResultAccuracyAttr) ) ), (replaceWithValue $output) @@ -739,7 +754,7 @@ def : Pattern<(TF_SoftplusOp AnyTensor:$features), //===----------------------------------------------------------------------===// def : Pat<(TF_XlaReplicaIdOp), - (TF_CastOp (MHLO_ReplicaIdOp), /*truncate=*/ConstBoolAttrFalse)>; + (TF_CastOp (StableHLO_ReplicaIdOp), /*truncate=*/ConstBoolAttrFalse)>; //===----------------------------------------------------------------------===// // XlaGather op. @@ -751,9 +766,9 @@ def HasValidGatherDims : Constraint>; def : Pat<(TF_XlaGatherOp $operand, $start_indices, (ConstantLikeMatcher ElementsAttr:$slice_sizes), $dimension_numbers, $indices_are_sorted), - (MHLO_GatherOp $operand, $start_indices, + (StableHLO_GatherOp $operand, $start_indices, (ToGatherDimNumsAttr $dimension_numbers), - (CastElementsToI64Elements $slice_sizes), + (CastElementsToI64Array $slice_sizes), $indices_are_sorted), [(HasValidGatherDims $dimension_numbers)]>; @@ -770,7 +785,7 @@ def HasValidDotDims : Constraint>; def HasValidPrecisionConfig : Constraint>; def : Pat<(TF_XlaDotOp $lhs, $rhs, $dimension_numbers, $precision_config), - (MHLO_DotGeneralOp $lhs, $rhs, + (StableHLO_DotGeneralOp $lhs, $rhs, (ToDotDimNumsAttr $dimension_numbers), (ToPrecisionConfigsAttr $precision_config), (EmptyDotAlgorithmAttr)), @@ -781,7 +796,7 @@ def : Pat<(TF_XlaDotOp $lhs, $rhs, $dimension_numbers, $precision_config), //===----------------------------------------------------------------------===// def : Pat<(TF_XlaDotV2Op $lhs, $rhs, $dimension_numbers, $precision_config), - (MHLO_DotGeneralOp $lhs, $rhs, + (StableHLO_DotGeneralOp $lhs, $rhs, (ToDotDimNumsAttr $dimension_numbers), (ToPrecisionConfigsAttr $precision_config), (EmptyDotAlgorithmAttr)), @@ -791,9 +806,9 @@ def : Pat<(TF_XlaDotV2Op $lhs, $rhs, $dimension_numbers, $precision_config), // XlaDynamicSlice op. 
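Note: for intuition, the Softplus lowering above implements the usual three-branch approximation; a scalar C++ reference of the same selection logic (illustrative only, not code from this change):

#include <cmath>
#include <limits>

// threshold = log(epsilon) + 2 is negative; the pattern selects x for large x,
// exp(x) for very negative x, and log1p(exp(x)) otherwise.
float SoftplusReference(float x) {
  const float threshold =
      std::log(std::numeric_limits<float>::epsilon()) + 2.0f;
  if (x > -threshold) return x;
  if (x < threshold) return std::exp(x);
  return std::log1p(std::exp(x));
}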
//===----------------------------------------------------------------------===// -def : Pat<(TF_XlaDynamicSliceOp:$op MHLO_Tensor:$input, MHLO_Tensor:$starting_indices, +def : Pat<(TF_XlaDynamicSliceOp:$op HLO_Tensor:$input, HLO_Tensor:$starting_indices, (ConstantLikeMatcher AnyAttr:$slice_sizes)), - (MHLO_DynamicSliceOp $input, + (StableHLO_DynamicSliceOp $input, (UnpackStartingIndices $op, $starting_indices), (TFSliceSizes2HLOSliceSizes $input, $starting_indices, $slice_sizes))>; @@ -802,11 +817,11 @@ def : Pat<(TF_XlaDynamicSliceOp:$op MHLO_Tensor:$input, MHLO_Tensor:$starting_in //===----------------------------------------------------------------------===// def : Pat<(TF_XlaEinsumOp $lhs, $rhs, $equation), - (MHLO_EinsumOp $lhs, $rhs, $equation)>; + (StableHLO_EinsumOp $lhs, $rhs, $equation)>; //===----------------------------------------------------------------------===// // XlaOptimizationBarrierOp op. //===----------------------------------------------------------------------===// def : Pat<(TF_XlaOptimizationBarrierOp $args), - (MHLO_OptimizationBarrierOp $args)>; + (StableHLO_OptimizationBarrierOp $args)>; diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc index 2d9bc167d2c0..1a9022731889 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc @@ -22,6 +22,7 @@ limitations under the License. #include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/IRMapping.h" // from @llvm-project @@ -31,6 +32,7 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/Base.h" // from @stablehlo #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.h" @@ -43,7 +45,6 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_expression.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" #include "xla/tsl/platform/env.h" #include "xla/tsl/platform/status.h" #include "xla/tsl/platform/statusor.h" @@ -75,13 +76,11 @@ bool IsBounded(Type ty) { if (ranked_ty.hasStaticShape()) return true; - auto encoding = - mlir::dyn_cast_or_null(ranked_ty.getEncoding()); - if (!encoding) return false; + auto bounds = hlo::encodingToBounds(ranked_ty.getEncoding()); + if (bounds.empty()) return false; for (int i = 0; i < ranked_ty.getRank(); ++i) { - if (ranked_ty.isDynamicDim(i) && - encoding.getBounds()[i] == ShapedType::kDynamic) { + if (ranked_ty.isDynamicDim(i) && bounds[i] == ShapedType::kDynamic) { return false; } } @@ -126,13 +125,13 @@ class Tf2XlaRewritePattern : public ConversionPattern { auto abstractOp = op->getRegisteredInfo(); if (!abstractOp) return failure(); - if (!(IsOpAllowedTf2xlaFallback(abstractOp->getTypeID()) || + if (!(hlo::IsOpAllowedTf2xlaFallback(abstractOp->getTypeID()) || (prefer_tf2xla_ && - IsOpAllowedTf2xlaPreferred(abstractOp->getTypeID())))) { + hlo::IsOpAllowedTf2xlaPreferred(abstractOp->getTypeID())))) { return failure(); } - return Tf2XlaRewriter::RewriteOp(op, rewriter, device_type_); + return hlo::Tf2XlaRewriter::RewriteOp(op, rewriter, device_type_); } private: diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/passes.h b/tensorflow/compiler/mlir/tf2xla/transforms/passes.h index 0b9f5a1efaab..85b97792f3d2 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/tf2xla/transforms/passes.h @@ -38,6 +38,19 @@ template class OperationPass; class Pass; +namespace hlo { + +// Verifies that the TF/XLA ops have all been lowered to MHLO. +std::unique_ptr> CreateVerifyTFXLALegalizationPass( + bool legalize_chlo = true); + +/// Adds the TF to TF lowerings and TF to XLA rewrite patterns to the pattern +/// list. +void PopulateLegalizeTfPatterns(MLIRContext* context, + RewritePatternSet* patterns); + +} // namespace hlo + namespace mhlo { /// Lowers from TF dialect to HLO dialect. When allow_partial_conversion is @@ -54,11 +67,6 @@ std::unique_ptr> createLegalizeTFPass( std::optional tf2xla_fallback_device_type = std::nullopt, bool prefer_tf2xla = false); -/// Adds the TF to TF lowerings and TF to XLA rewrite patterns to the pattern -/// list. -void PopulateLegalizeTfPatterns(MLIRContext* context, - RewritePatternSet* patterns); - // Populates TF to MHLO legalization for some of the quantization ops. // // TODO(hinsu): Remove this once we combine quantized and non quantized op @@ -88,10 +96,6 @@ std::unique_ptr> CreateLegalizeTFCommunicationPass(); // ops. std::unique_ptr> CreateLegalizeTFCollectivePass(); -// Verifies that the TF/XLA ops have all been lowered to MHLO. -std::unique_ptr> CreateVerifyTFXLALegalizationPass( - bool legalize_chlo = true); - // Transforms TFXLA Device specific ops into device independent ops. std::unique_ptr> CreateTFXLADeviceSpecificTransformsPass( diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.cc b/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.cc index 3b33311b4f02..7f458bc90ba2 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.cc @@ -25,7 +25,7 @@ limitations under the License. 
#include "xla/tsl/platform/statusor.h" namespace mlir { -namespace mhlo { +namespace hlo { namespace test { using ::mlir::DialectRegistry; @@ -50,5 +50,5 @@ absl::StatusOr> GetMlirModuleFromString( } } // namespace test -} // namespace mhlo +} // namespace hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h b/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h index 0bfd53dc1104..0aa4c036c38d 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h +++ b/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h @@ -24,7 +24,7 @@ limitations under the License. #include "xla/tsl/platform/statusor.h" namespace mlir { -namespace mhlo { +namespace hlo { namespace test { // Given a raw string, return a ModuleOp that can be used with the given @@ -33,7 +33,7 @@ absl::StatusOr> GetMlirModuleFromString( absl::string_view module_string, MLIRContext* mlir_context); } // namespace test -} // namespace mhlo +} // namespace hlo } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TEST_UTILS_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc index 161ae934df7d..ba20437c2174 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc @@ -37,6 +37,7 @@ limitations under the License. #include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/IRMapping.h" // from @llvm-project @@ -50,6 +51,8 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "stablehlo/dialect/Base.h" // from @stablehlo +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.h" @@ -69,7 +72,6 @@ limitations under the License. #include "xla/hlo/translate/hlo_to_mhlo/hlo_function_importer.h" #include "xla/hlo/translate/hlo_to_mhlo/hlo_to_mlir_hlo.h" #include "xla/hlo/translate/mhlo_to_hlo/type_to_shape.h" -#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" #include "xla/service/hlo.pb.h" #include "xla/tsl/platform/env.h" #include "xla/tsl/platform/errors.h" @@ -90,7 +92,7 @@ limitations under the License. 
#include "tensorflow/core/public/session_options.h" namespace mlir { -namespace mhlo { +namespace hlo { namespace { using ::mlir::ModuleOp; @@ -154,7 +156,7 @@ Tf2XlaRewriter::~Tf2XlaRewriter() { if (context_) context_->Unref(); } -absl::StatusOr Tf2XlaRewriter::ImportXlaComputation( +absl::StatusOr Tf2XlaRewriter::ImportXlaComputation( XlaComputation& computation) { xla::DebugOptions debug_options; TF_ASSIGN_OR_RETURN(auto hlo_module_config, @@ -193,8 +195,8 @@ absl::StatusOr Tf2XlaRewriter::ImportXlaComputation( xla::HloFunctionImporter::ImportInstructions( *hlo_module->entry_computation(), arguments, symbol_table, &builder)); - mhlo::TupleOp root_tuple = - mlir::dyn_cast_or_null(root_value.getDefiningOp()); + stablehlo::TupleOp root_tuple = + mlir::dyn_cast_or_null(root_value.getDefiningOp()); if (!root_tuple) { return tsl::errors::InvalidArgument( "Imported XLA Root Value is not a tuple op"); @@ -259,13 +261,11 @@ bool IsBounded(Type ty) { if (ranked_ty.hasStaticShape()) return true; - auto encoding = - mlir::dyn_cast_or_null(ranked_ty.getEncoding()); - if (!encoding) return false; + ArrayRef bounds = hlo::encodingToBounds(ranked_ty.getEncoding()); + if (bounds.empty()) return false; for (int i = 0; i < ranked_ty.getRank(); ++i) { - if (ranked_ty.isDynamicDim(i) && - encoding.getBounds()[i] == ShapedType::kDynamic) { + if (ranked_ty.isDynamicDim(i) && bounds[i] == ShapedType::kDynamic) { return false; } } @@ -410,23 +410,23 @@ LogicalResult Tf2XlaRewriter::LegalizeOp() { if (failed(VerifyOpResults(op_context))) return failure(); - absl::StatusOr tuple_result_or_status = + absl::StatusOr tuple_result_or_status = CompileWithHloImporter(op_context); if (!tuple_result_or_status.ok()) { return op_->emitRemark() << tuple_result_or_status.status().ToString(); } - mhlo::TupleOp tuple_result = tuple_result_or_status.value(); + stablehlo::TupleOp tuple_result = tuple_result_or_status.value(); - llvm::SmallVector output_values; - if (failed(GetKernelOutputs(op_context, tuple_result, output_values))) { - return failure(); - } + llvm::SmallVector output_values; + if (failed(GetKernelOutputs(op_context, tuple_result, output_values))) { + return failure(); + } rewriter_.replaceOp(op_, output_values); return success(); } -absl::StatusOr Tf2XlaRewriter::CompileWithHloImporter( +absl::StatusOr Tf2XlaRewriter::CompileWithHloImporter( tensorflow::OpKernelContext& op_context) { // XLA can only return a single value. Wrap all output op return values // in a Tuple op that gets unpacked later. @@ -470,7 +470,7 @@ mlir::LogicalResult Tf2XlaRewriter::VerifyOpResults( // multiple values. We get around this by returning a tuple as an XLA op. We // then unpack it here to return the multiple values instead. 
mlir::LogicalResult Tf2XlaRewriter::UnpackTupleResults( - mhlo::TupleOp tuple_result, llvm::SmallVector& outputs) { + stablehlo::TupleOp tuple_result, llvm::SmallVector& outputs) { if (tuple_result->getNumOperands() != op_->getNumResults()) { return op_->emitRemark() << "Translated TF2XLA tuple has different " "number of results than original op"; @@ -485,7 +485,7 @@ mlir::LogicalResult Tf2XlaRewriter::UnpackTupleResults( } mlir::LogicalResult Tf2XlaRewriter::GetKernelOutputs( - tensorflow::OpKernelContext& op_context, mhlo::TupleOp tuple_results, + tensorflow::OpKernelContext& op_context, stablehlo::TupleOp tuple_results, llvm::SmallVector& outputs) { outputs.reserve(op_->getNumResults()); @@ -522,5 +522,5 @@ tensorflow::XlaExpression Tf2XlaRewriter::GetExprForOperand( return tensorflow::XlaExpression::XlaOp(xla_op, dtype); } -} // namespace mhlo +} // namespace hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h index c5c417e27ba0..371db7214bc9 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h +++ b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h @@ -28,18 +28,17 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_expression.h" #include "xla/hlo/builder/xla_builder.h" #include "xla/hlo/builder/xla_computation.h" -#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/framework/op_kernel.h" namespace mlir { -namespace mhlo { - +namespace hlo { class Tf2XlaRewriterTestPeer; class Tf2XlaRewriter { @@ -58,12 +57,12 @@ class Tf2XlaRewriter { // Compiles the given Operation with XlaBuilder and imports the generated HLO // via the HLO -> MHLO importer. - absl::StatusOr CompileWithHloImporter( + absl::StatusOr CompileWithHloImporter( tensorflow::OpKernelContext& op_context); // Import the given XlaComputation into the parent module. Returns the given // generated function. - absl::StatusOr ImportXlaComputation( + absl::StatusOr ImportXlaComputation( xla::XlaComputation& computation); // Prepares OpKernelContext params common to all the ops. @@ -83,12 +82,12 @@ class Tf2XlaRewriter { mlir::LogicalResult VerifyOpResults(tensorflow::OpKernelContext& op_context); mlir::LogicalResult GetKernelOutputs(tensorflow::OpKernelContext& op_context, - mhlo::TupleOp tuple_results, + stablehlo::TupleOp tuple_results, llvm::SmallVector& outputs); // Given a translated function with a single return value, unpack the tuple // results. - mlir::LogicalResult UnpackTupleResults(mhlo::TupleOp tuple_result, + mlir::LogicalResult UnpackTupleResults(stablehlo::TupleOp tuple_result, llvm::SmallVector& outputs); // Tries to legalize the specified TensorFlow op, if supported. 
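Note: downstream code and tests that previously matched on mhlo::TupleOp now match on stablehlo::TupleOp. For example, the updated unit test below counts leftover tuples with a walk like this (hypothetical helper mirroring the test, not new functionality):

int CountTupleOps(ModuleOp module) {
  int num_tuple_ops = 0;
  module->walk(
      [&num_tuple_ops](stablehlo::TupleOp tuple_op) { num_tuple_ops += 1; });
  return num_tuple_ops;
}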
@@ -122,7 +121,7 @@ class Tf2XlaRewriter { xla::XlaBuilder xla_builder_; }; -} // namespace mhlo +} // namespace hlo } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_TF2XLA_REWRITER_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc index eaad485ccab9..14da8868e5cb 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc @@ -26,6 +26,7 @@ limitations under the License. #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project @@ -33,23 +34,22 @@ limitations under the License. #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "xla/hlo/builder/xla_builder.h" #include "xla/hlo/builder/xla_computation.h" -#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" #include "xla/shape_util.h" #include "xla/tsl/lib/core/status_test_util.h" #include "xla/tsl/platform/errors.h" #include "xla/tsl/platform/status.h" #include "xla/tsl/platform/statusor.h" #include "xla/xla_data.pb.h" -#include "tensorflow/core/framework/op_kernel.h" namespace mlir { -namespace mhlo { +namespace hlo { using ::mlir::LogicalResult; using ::mlir::ModuleOp; @@ -102,7 +102,7 @@ class Tf2XlaRewriterTestPeer { tf2xla_rewriter_(op, empty_rewriter_, /*device_type=*/"XLA_CPU_JIT") {} - absl::StatusOr ImportXlaComputationIntoModule( + absl::StatusOr ImportXlaComputationIntoModule( XlaComputation& computation) { return tf2xla_rewriter_.ImportXlaComputation(computation); } @@ -123,7 +123,7 @@ class Tf2XlaRewriterTest : public ::testing::Test { Status CreateMlirModule(std::string module_string = kMlirModuleStr) { TF_ASSIGN_OR_RETURN( - module_, test::GetMlirModuleFromString(module_string, &context_)); + module_, hlo::test::GetMlirModuleFromString(module_string, &context_)); context_.loadAllAvailableDialects(); return absl::OkStatus(); @@ -184,7 +184,7 @@ class Tf2XlaRewriterTest : public ::testing::Test { return main_func.getBody().front().front(); } - absl::StatusOr ImportXlaComputationIntoModule( + absl::StatusOr ImportXlaComputationIntoModule( XlaComputation& computation) { SourceMgrDiagnosticHandler sourceMgrHandler(source_manager_, &context_); @@ -204,7 +204,8 @@ TEST_F(Tf2XlaRewriterTest, LegalizesOpWithTf2xlaHloImporter) { TF_EXPECT_OK(LegalizeModule()); int num_tuple_ops = 0; - module_->walk([&num_tuple_ops](TupleOp tuple_op) { num_tuple_ops += 1; }); + module_->walk( + [&num_tuple_ops](stablehlo::TupleOp tuple_op) { num_tuple_ops += 1; }); EXPECT_EQ(num_tuple_ops, 0); } @@ -214,7 +215,7 @@ TEST_F(Tf2XlaRewriterTest, ImportsXlaComputationIntoModule) { XlaComputation computation = GetTestXlaComputation(); - TF_ASSERT_OK_AND_ASSIGN(TupleOp root_tuple, + 
TF_ASSERT_OK_AND_ASSIGN(stablehlo::TupleOp root_tuple, ImportXlaComputationIntoModule(computation)); ModuleOp parent_module = @@ -261,7 +262,7 @@ TEST_F(Tf2XlaRewriterTest, ImportsSingleComputation) { EXPECT_EQ(computation.proto().computations_size(), 2); TF_ASSERT_OK(CreateMlirModule()); - TF_ASSERT_OK_AND_ASSIGN(TupleOp root_tuple, + TF_ASSERT_OK_AND_ASSIGN(stablehlo::TupleOp root_tuple, ImportXlaComputationIntoModule(computation)); EXPECT_TRUE(root_tuple); @@ -356,10 +357,10 @@ TEST_F(Tf2XlaRewriterTest, ErrorsWithInvalidNumberOfParametersToArgs) { EXPECT_EQ(computation.proto().computations_size(), 2); TF_ASSERT_OK(CreateMlirModule()); - absl::StatusOr status_or_tuple_op = + absl::StatusOr status_or_tuple_op = ImportXlaComputationIntoModule(computation); EXPECT_FALSE(status_or_tuple_op.ok()); } -} // namespace mhlo +} // namespace hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization.cc b/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization.cc index d99f80ff5eac..8530da4b9080 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization.cc @@ -21,11 +21,13 @@ limitations under the License. #include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "stablehlo/dialect/Base.h" // from @stablehlo #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" @@ -87,9 +89,8 @@ static void IncrementCounterFor(tensorflow::monitoring::Counter<1>* counter, } bool HasBounds(RankedTensorType type) { - auto encoding = mlir::dyn_cast_or_null( - type.getEncoding()); - return (encoding && !encoding.getBounds().empty()); + auto bounds = hlo::encodingToBounds(type.getEncoding()); + return !bounds.empty(); } bool HasStaticShapeOrBounded(Value val) { @@ -146,7 +147,7 @@ bool IsDefaultConversionLegal( void VerifyTFXLALegalization::runOnOperation() { Operation* func_op = getOperation(); ConversionTarget default_conversion_target = - GetDefaultLegalConversionTargets(getContext(), legalize_chlo_); + hlo::GetDefaultLegalConversionTargets(getContext(), legalize_chlo_); bool has_invalid_ops = false; func_op->walk([&](Operation* op) { @@ -167,10 +168,13 @@ void VerifyTFXLALegalization::runOnOperation() { } // namespace +} // namespace mhlo + +namespace hlo { + std::unique_ptr> CreateVerifyTFXLALegalizationPass(bool legalize_chlo) { - return std::make_unique(legalize_chlo); + return std::make_unique(legalize_chlo); } - -} // namespace mhlo +} // namespace hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization_test.cc index e0bc0f1ebe50..eee0c76e7d68 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization_test.cc @@ -37,7 +37,7 @@ namespace { using ::mlir::MLIRContext; using 
::mlir::ModuleOp; using ::mlir::OwningOpRef; -using ::mlir::mhlo::test::GetMlirModuleFromString; +using ::mlir::hlo::test::GetMlirModuleFromString; using ::tensorflow::monitoring::testing::CellReader; static constexpr char kFailedLegalizationStreamz[] = @@ -55,7 +55,7 @@ class VerifyTfxlaLegalizationTest : public ::testing::Test { pm_ = std::make_unique(&context_); pm_->addNestedPass( - mlir::mhlo::CreateVerifyTFXLALegalizationPass(/*legalize_chlo=*/false)); + mlir::hlo::CreateVerifyTFXLALegalizationPass(/*legalize_chlo=*/false)); } mlir::LogicalResult Run() { return pm_->run(module_.get()); } diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.cc b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.cc index 816b9a5e8b77..7a905fdd017d 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.cc @@ -27,7 +27,7 @@ limitations under the License. #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" namespace mlir { -namespace mhlo { +namespace hlo { ConversionTarget GetDefaultLegalConversionTargets(MLIRContext& mlir_context, bool legalize_chlo) { @@ -39,7 +39,7 @@ ConversionTarget GetDefaultLegalConversionTargets(MLIRContext& mlir_context, } else { target.addLegalDialect(); } - target.addLegalDialect(); + target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); @@ -54,5 +54,5 @@ ConversionTarget GetDefaultLegalConversionTargets(MLIRContext& mlir_context, return target; } -} // namespace mhlo +} // namespace hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.h b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.h index 1711e0391af9..55aca716ac4d 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.h +++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.h @@ -20,7 +20,7 @@ limitations under the License. #include "mlir/Transforms/DialectConversion.h" // from @llvm-project namespace mlir { -namespace mhlo { +namespace hlo { // Returns a ConversionTarget that includes default legalized MLIR dialects // for conversion to XLA. @@ -28,7 +28,7 @@ namespace mhlo { mlir::ConversionTarget GetDefaultLegalConversionTargets( MLIRContext& mlir_context, bool legalize_chlo); -} // namespace mhlo +} // namespace hlo } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_TRANSFORMS_XLA_LEGALIZE_TARGETS_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets_test.cc index 635d7dc15bb7..fbdb818e5236 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets_test.cc @@ -31,7 +31,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" namespace mlir { -namespace mhlo { +namespace hlo { namespace { mlir::DialectRegistry GetDefaultDialectRegistry() { @@ -91,5 +91,5 @@ TEST_F(XlaLegalizeTargetsTest, DontAllowCHLODialect) { } } // namespace -} // namespace mhlo +} // namespace hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc index f5364586ec73..d312bc1cafdc 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "mhlo/transforms/rewriters.h" #include "absl/log/log.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" @@ -35,18 +36,19 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/DebugStringHelper.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "stablehlo/dialect/ChloOps.h" // from @stablehlo #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo -#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h" +#include "stablehlo/transforms/Passes.h" // from @stablehlo #include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla_passes.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.h" -#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" // IWYU pragma: keep, dependent dialect #include "xla/mlir_hlo/mhlo/transforms/rewriters.h" #include "xla/mlir_hlo/mhlo/utils/type_conversion.h" #include "tensorflow/core/lib/monitoring/counter.h" @@ -96,8 +98,8 @@ RewritePatternSet PatternsIncludeOps(RewritePatternSet &from) { // If the pattern does not have a specific operation, always include it, // If the pattern is in include_ops then include it. 
bool include = - !pat_op_name || - IsTypeLegalizedWithMlir(pat_op_name->getRegisteredInfo()->getTypeID()); + !pat_op_name || hlo::IsTypeLegalizedWithMlir( + pat_op_name->getRegisteredInfo()->getTypeID()); if (include) to.add(std::move(pattern)); } @@ -139,7 +141,7 @@ void IncrementFailedLegalizationCount(Operation *op, mlir::LogicalResult ApplyPatterns(Operation *op, RewritePatternSet &patterns, bool legalize_chlo) { ConversionTarget target = - GetDefaultLegalConversionTargets(*op->getContext(), legalize_chlo); + hlo::GetDefaultLegalConversionTargets(*op->getContext(), legalize_chlo); DenseSet unconverted_ops; ConversionConfig config; @@ -154,6 +156,22 @@ mlir::LogicalResult ApplyPatterns(Operation *op, RewritePatternSet &patterns, return result; } +mlir::LogicalResult StablehloToMhlo(Operation *op) { + ConversionTarget target(*op->getContext()); + stablehlo::setupStablehloToHloConversionTarget(target); + + RewritePatternSet patterns(op->getContext()); + stablehlo::StablehloToHloTypeConverter shlo_converter; + stablehlo::populateStablehloToHloPatterns(&patterns, &shlo_converter, + patterns.getContext()); + stablehlo::registerFuncOpsForTypeConversion(target, patterns, shlo_converter); + + if (failed(applyPartialConversion(op, target, std::move(patterns)))) { + return op->emitError("TF2XLA failed to convert StableHLO to MHLO"); + } + return success(); +} + /// When `tf2xla_fallback_device_type` is not `None`, also uses legalization /// patterns from TF2XLA fallback for provided device type (see /// legalize_tf_with_tf2xla.cc for details). By default, TF2XLA fallback is @@ -175,7 +193,7 @@ LogicalResult legalizeTF(Operation *op, bool legalize_chlo, // 4) Order of patterns in `RewritePatternSet`. // Add TF->HLO legalization patterns. - PopulateLegalizeTfPatterns(context, &legalize_lower_patterns); + hlo::PopulateLegalizeTfPatterns(context, &legalize_lower_patterns); // Add TF->TF lowering patterns. TF::PopulateTFLoweringBeforeHLOPatterns(context, &legalize_lower_patterns); @@ -208,20 +226,30 @@ LogicalResult legalizeTF(Operation *op, bool legalize_chlo, // Populate with CHLO->HLO lowerings to account for TF ops legalized to // CHLO first. stablehlo::StablehloToHloTypeConverter hlo_converter; + stablehlo::populateStablehloToHloPatterns(&patterns, &hlo_converter, context); if (legalize_chlo) { - chlo::populateChloToHloPatterns(context, &hlo_converter, &patterns); + chlo::populateChloToHighLevelMhloOpPatterns(context, &patterns); + stablehlo::populateChloToStablehloPatterns(context, &patterns); } // ConstantLike op is convenient to create splat constants, but is // canonicalized to plain HLO constant if statically shaped. Add the // canonicalization pattern to pattern list to enable multi-hop lowering. chlo::ConstantLikeOp::getCanonicalizationPatterns(patterns, context); - return ApplyPatterns(op, patterns, legalize_chlo); + if (failed(ApplyPatterns(op, patterns, legalize_chlo))) { + return failure(); + } + + // HLO->MLIR raises to StableHLO, but users of this pass expect MHLO. + return StablehloToMhlo(op); } // Performs the lowering to XLA dialect. 
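Note: put together, legalizeTF above now runs in two phases: the DRR and TF2XLA patterns rewrite TF into StableHLO/CHLO, and a final partial conversion lowers the result back to MHLO so existing consumers of this pass are unaffected. A condensed, hypothetical driver using the functions defined in this file (TF2XLA fallback and pattern filtering omitted):

LogicalResult RunTfToMhlo(Operation *op, bool legalize_chlo) {
  MLIRContext *context = op->getContext();
  RewritePatternSet patterns(context);

  // TF -> StableHLO/CHLO (plus the TF -> TF pre-lowerings added by legalizeTF).
  hlo::PopulateLegalizeTfPatterns(context, &patterns);
  if (failed(ApplyPatterns(op, patterns, legalize_chlo))) return failure();

  // StableHLO -> MHLO, so callers keep receiving MHLO.
  return StablehloToMhlo(op);
}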
void LegalizeTF::runOnOperation() { auto op = getOperation(); + VLOG(3) << "LegalizeTF(legalize_chlo=" << legalize_chlo_ + << ", prefer_tf2xla=" << prefer_tf2xla_ << ") on module:\n" + << mlir::debugString(*op); auto op_name = op->getName().getStringRef().str(); mlir_legalization_count->GetCell(op_name)->IncrementBy(1); std::optional tf2xla_fallback_device_type = std::nullopt; diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td index 368afec6ef07..5a87e106953f 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td +++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td @@ -77,7 +77,7 @@ def VerifyTFXLALegalization : Pass<"tfxla-verify-legalization", "mlir::func::Fun "Legalizes intermediate chlo ops to hlo"> ]; - let constructor = "mlir::mhlo::CreateVerifyTFXLALegalizationPass()"; + let constructor = "mlir::hlo::CreateVerifyTFXLALegalizationPass()"; } def TFXLADeviceSpecificTransforms : Pass<"tfxla-device-specific-transforms", diff --git a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc index 73e6e874555f..bf930ed91492 100644 --- a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc @@ -36,7 +36,7 @@ limitations under the License. #include "xla/mlir/framework/transforms/passes.h" #include "xla/mlir_hlo/mhlo/transforms/passes.h" -int main(int argc, char **argv) { +int main(int argc, char** argv) { tensorflow::InitMlir y(&argc, &argv); mlir::registerAllPasses(); diff --git a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc index babd62f6b13f..80e58756bbfa 100644 --- a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc @@ -33,8 +33,8 @@ limitations under the License. 
#include "mlir/Support/ToolUtilities.h" // from @llvm-project #include "mlir/Tools/mlir-translate/Translation.h" // from @llvm-project #include "tensorflow/compiler/mlir/init_mlir.h" -#include "tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" +#include "tensorflow/compiler/mlir/tools/tf_mlir_translate_cl.h" #include "tensorflow/core/platform/init_main.h" // NOLINTNEXTLINE diff --git a/tensorflow/compiler/mlir/tfr/BUILD b/tensorflow/compiler/mlir/tfr/BUILD index a90f25aab887..23242c9c0f77 100644 --- a/tensorflow/compiler/mlir/tfr/BUILD +++ b/tensorflow/compiler/mlir/tfr/BUILD @@ -53,16 +53,10 @@ td_library( gentbl_cc_library( name = "tfr_ops_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-decls"], - "ir/tfr_ops.h.inc", - ), - ( - ["-gen-op-defs"], - "ir/tfr_ops.cc.inc", - ), - ], + tbl_outs = { + "ir/tfr_ops.h.inc": ["-gen-op-decls"], + "ir/tfr_ops.cc.inc": ["-gen-op-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tfr_ops.td", deps = [ @@ -73,12 +67,7 @@ gentbl_cc_library( gentbl_cc_library( name = "tfr_decompose_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-rewriters"], - "passes/generated_decompose.inc", - ), - ], + tbl_outs = {"passes/generated_decompose.inc": ["-gen-rewriters"]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/decompose_patterns.td", deps = [ @@ -101,7 +90,6 @@ cc_library( ], deps = [ ":tfr_ops_inc_gen", - "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", @@ -154,7 +142,6 @@ cc_library( deps = [ ":tfr", ":utils", - "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/core:lib", "@com_google_absl//absl/memory", diff --git a/tensorflow/compiler/mlir/tfr/build_defs.bzl b/tensorflow/compiler/mlir/tfr/build_defs.bzl index 56a05d191de5..e2cdb93bd4d7 100644 --- a/tensorflow/compiler/mlir/tfr/build_defs.bzl +++ b/tensorflow/compiler/mlir/tfr/build_defs.bzl @@ -1,6 +1,6 @@ """BUILD extension for TF composition project.""" -load("@local_xla//third_party/py/rules_pywrap:pywrap.bzl", "use_pywrap_rules") +load("@local_xla//third_party/py/rules_pywrap:pywrap.default.bzl", "use_pywrap_rules") load("//tensorflow:strict.default.bzl", "py_strict_binary", "py_strict_library") load("//tensorflow:tensorflow.bzl", "tf_custom_op_library", "tf_gen_op_wrapper_py") load("//tensorflow:tensorflow.default.bzl", "tf_custom_op_py_library") diff --git a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc index 642caf2306b1..0c1a7c4dbf31 100644 --- a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc +++ b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc @@ -14,7 +14,9 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h" +#include #include +#include #include #include "absl/status/status.h" diff --git a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h index 4b701132c23c..3b831c1586a9 100644 --- a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h +++ b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_TFR_DECOMPOSE_CTX_H_ #define TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_TFR_DECOMPOSE_CTX_H_ +#include "absl/status/status.h" #include "absl/status/statusor.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx_test.cc b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx_test.cc index 39fdd8391ce3..ab10c02926e7 100644 --- a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx_test.cc +++ b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx_test.cc @@ -14,7 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h" +#include #include +#include #include #include "absl/types/span.h" diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc index 6780328b8e89..d44e65f029ad 100644 --- a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc @@ -111,12 +111,11 @@ class TFRInlinerInterface : public DialectInlinerInterface { Operation *materializeCallConversion(OpBuilder &builder, Value input, Type result_type, Location conversion_loc) const final { - if (!input.getType().isa() || - !result_type.isa()) { + if (!isa(input.getType()) || !isa(result_type)) { return nullptr; } - auto input_itype = input.getType().cast(); - auto result_itype = result_type.cast(); + auto input_itype = llvm::cast(input.getType()); + auto result_itype = llvm::cast(result_type); if (input_itype.getWidth() == result_itype.getWidth()) return nullptr; if (input_itype.getWidth() > result_itype.getWidth()) { return builder.create(conversion_loc, result_type, @@ -150,10 +149,10 @@ Operation *TFRDialect::materializeConstant(OpBuilder &builder, Attribute value, Type type, Location loc) { if (arith::ConstantOp::isBuildableWith(value, type)) return builder.create(loc, type, - value.cast()); + llvm::cast(value)); if (func::ConstantOp::isBuildableWith(value, type)) - return builder.create(loc, type, - value.cast()); + return builder.create( + loc, type, llvm::cast(value)); return nullptr; } @@ -180,11 +179,11 @@ LogicalResult ConstantTensorOp::verify() { auto input_type = op.getArg().getType(); auto output_type = op.getOut().getType(); - if (auto output_tensor_type = output_type.dyn_cast()) { + if (auto output_tensor_type = llvm::dyn_cast(output_type)) { return success(); } - auto output_tensor_type = output_type.dyn_cast(); + auto output_tensor_type = llvm::dyn_cast(output_type); if (!output_tensor_type || !output_tensor_type.hasStaticShape()) { op.emitError("output type should be static and ranked."); return failure(); @@ -198,7 +197,7 @@ LogicalResult ConstantTensorOp::verify() { return success(same_scalar); } - if (auto input_vector_type = 
input_type.dyn_cast()) { + if (auto input_vector_type = llvm::dyn_cast(input_type)) { bool same_element_type = output_tensor_type.getElementType() == input_vector_type.getElementType(); bool same_shape = @@ -230,7 +229,7 @@ LogicalResult TFRFuncOp::verify() { for (auto arg : llvm::enumerate(func.getFunctionType().getInputs())) { Type arg_type = arg.value(); - if (auto tensor = arg_type.dyn_cast()) { + if (auto tensor = llvm::dyn_cast(arg_type)) { if (first_tensor == -1) { first_tensor = arg.index(); } @@ -240,7 +239,7 @@ LogicalResult TFRFuncOp::verify() { continue; } - if (auto tensor_list = arg_type.dyn_cast()) { + if (auto tensor_list = llvm::dyn_cast(arg_type)) { if (first_tensor_list == -1) { first_tensor_list = arg.index(); } @@ -250,7 +249,7 @@ LogicalResult TFRFuncOp::verify() { continue; } - if (!arg_type.isa()) { + if (!isa(arg_type)) { if (first_attr == -1) { first_attr = arg.index(); } @@ -307,7 +306,7 @@ LogicalResult TFRFuncOp::verify() { bool seen_tensor_list = false, has_tensor_list_order_error = false, has_multiple_tensor_lists_error = false; for (auto result_type : func.getFunctionType().getResults()) { - if (auto tensor = result_type.dyn_cast()) { + if (auto tensor = llvm::dyn_cast(result_type)) { if (seen_tensor_list) { has_tensor_list_order_error = true; } else { @@ -317,7 +316,7 @@ LogicalResult TFRFuncOp::verify() { continue; } - if (auto tensor_list = result_type.dyn_cast()) { + if (auto tensor_list = llvm::dyn_cast(result_type)) { if (seen_tensor_list) { has_multiple_tensor_lists_error = true; } else { @@ -413,7 +412,7 @@ class ConvertConstToTensorConst : public OpRewritePattern { if (matchPattern(cst_tensor_op.getArg(), m_Constant(&array))) { llvm::DenseSet all_types; for (auto it : array) { - TypedAttr typed_attr = it.dyn_cast(); + TypedAttr typed_attr = llvm::dyn_cast(it); if (!typed_attr) return failure(); all_types.insert(typed_attr.getType()); } @@ -423,7 +422,7 @@ class ConvertConstToTensorConst : public OpRewritePattern { DenseElementsAttr attr = DenseElementsAttr::get(new_out_type, array.getValue()); new_cst = rewriter.create(loc, new_out_type, attr); - if (out_type.isa()) { + if (isa(out_type)) { new_cst = rewriter.create(loc, out_type, new_cst->getResult(0)); } rewriter.replaceOp(cst_tensor_op, new_cst->getResult(0)); @@ -434,7 +433,7 @@ class ConvertConstToTensorConst : public OpRewritePattern { if (matchPattern(cst_tensor_op.getArg(), m_Constant(&scalar))) { Type new_out_type = RankedTensorType::get({}, scalar.getType()); new_cst = rewriter.create(loc, new_out_type, scalar); - if (out_type.isa()) { + if (isa(out_type)) { new_cst = rewriter.create(loc, out_type, new_cst->getResult(0)); } rewriter.replaceOp(cst_tensor_op, new_cst->getResult(0)); @@ -445,9 +444,9 @@ class ConvertConstToTensorConst : public OpRewritePattern { }; inline bool isQuantizedType(Type type) { - auto tensor_type = type.dyn_cast(); + auto tensor_type = llvm::dyn_cast(type); return (tensor_type && - tensor_type.getElementType().isa()); + isa(tensor_type.getElementType())); } class RemoveRedundantCast : public OpRewritePattern { @@ -471,8 +470,8 @@ class RemoveRedundantCast : public OpRewritePattern { return failure(); } - auto input_tensor_type = input_type.dyn_cast(); - auto output_tensor_type = output_type.dyn_cast(); + auto input_tensor_type = llvm::dyn_cast(input_type); + auto output_tensor_type = llvm::dyn_cast(output_type); if (!input_tensor_type || !output_tensor_type) { return failure(); } @@ -493,7 +492,7 @@ class RemoveRedundantCast : public OpRewritePattern { // If the 
two types are the same, the back-to-back tfr.cast ops can be // removed. - if (input_type == output_type || output_type.isa()) { + if (input_type == output_type || isa(output_type)) { rewriter.replaceOp(cast_op, {input}); return success(); } @@ -501,8 +500,8 @@ class RemoveRedundantCast : public OpRewritePattern { // If the rank of the input tensor isn't ranked, we replace the pair // with tf.EnsureShape op so it can be removed after shape inference or // confirmed at runtime. - if (input_type.isa()) { - auto shape = output_type.cast().getShape(); + if (isa(input_type)) { + auto shape = llvm::cast(output_type).getShape(); auto shape_attr = TF::ShapeAttr::get(rewriter.getContext(), shape); rewriter.replaceOpWithNewOp(cast_op, output_type, input, shape_attr); @@ -548,7 +547,7 @@ class RemoveRedundantGetElement : public OpRewritePattern { Value input = preceding_build_list.getOperand(index.getInt()); Type output_type = ge_op.getType(); if (input.getType() != output_type && - !output_type.isa()) { + !isa(output_type)) { return failure(); } rewriter.replaceOp(ge_op, {input}); @@ -599,10 +598,8 @@ quant::QuantizedType getQuantizedElementType(CastOp cast_op) { if (!cast_op || !cast_op.getInputElementType()) { return {}; } - return cast_op.getInputElementType() - .cast() - .getValue() - .dyn_cast(); + return llvm::dyn_cast( + llvm::cast(cast_op.getInputElementType()).getValue()); } class RemoveRawDataOp : public OpRewritePattern { @@ -681,15 +678,15 @@ class RemoveQParamsOp : public OpRewritePattern { // them to constants. rewriter.setInsertionPoint(qparams_op); Location loc = qparams_op->getLoc(); - if (auto qtype = cast_qtype.dyn_cast()) { + if (auto qtype = llvm::dyn_cast(cast_qtype)) { scale_op = rewriter.create( loc, RankedTensorType::get({}, rewriter.getF32Type()), rewriter.getF32FloatAttr(qtype.getScale())); zp_op = rewriter.create( loc, RankedTensorType::get({}, rewriter.getI32Type()), rewriter.getI32IntegerAttr(qtype.getZeroPoint())); - } else if (auto qtype = - cast_qtype.dyn_cast()) { + } else if (auto qtype = llvm::dyn_cast( + cast_qtype)) { SmallVector scales(qtype.getScales().begin(), qtype.getScales().end()); SmallVector zps(qtype.getZeroPoints().begin(), @@ -745,7 +742,7 @@ class RemoveScaleFactorOp : public OpRewritePattern { return failure(); } const double out_scale = - out_scale_op.getValue().cast().getValueAsDouble(); + llvm::cast(out_scale_op.getValue()).getValueAsDouble(); auto in_scales_op = scale_factor_op.getInScales().getDefiningOp(); @@ -778,7 +775,8 @@ class RemoveScaleFactorOp : public OpRewritePattern { // The shape of scale_type is {} (rank 0) for per-tensor quantized tensor, // and {num_channels} (rank 1) for per-channel quantized one. 
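Note: the recurring change in tfr_ops.cc above is mechanical: member-style casts on Type and Attribute are replaced with the free-function forms that LLVM/MLIR have standardized on. A minimal before/after sketch with a hypothetical helper:

static bool IsStaticRankedTensor(mlir::Type type) {
  // Before: auto tensor_ty = type.dyn_cast<mlir::RankedTensorType>();
  auto tensor_ty = llvm::dyn_cast<mlir::RankedTensorType>(type);
  return tensor_ty && tensor_ty.hasStaticShape();
}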
- auto scale_type = filter_scale_attr.getType().dyn_cast(); + auto scale_type = + llvm::dyn_cast(filter_scale_attr.getType()); if (scale_type.getRank() != 0 && scale_type.getRank() != 1) { return failure(); } @@ -995,14 +993,14 @@ Type TFRDialect::parseType(DialectAsmParser &parser) const { void TFRDialect::printType(Type type, DialectAsmPrinter &os) const { llvm::ArrayRef attrs; - if (type.isa()) { + if (isa(type)) { os << "attr"; return; } - if (auto tensor_ty = type.dyn_cast()) { + if (auto tensor_ty = llvm::dyn_cast(type)) { attrs = tensor_ty.getAttrKeys(); os << "tensor"; - } else if (auto tensor_list_ty = type.dyn_cast()) { + } else if (auto tensor_list_ty = llvm::dyn_cast(type)) { attrs = tensor_list_ty.getAttrKeys(); os << "tensor_list"; } else { diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td index 7cdaee96512d..d1014fec8e3e 100644 --- a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td @@ -49,7 +49,7 @@ def TFR_Dialect : Dialect { // tensor argument types class TFR_Type : DialectType()">, + CPred<"llvm::isa($_self)">, "TFR " # name #" type">, BuildableType<"$_builder.getType()">; def TFR_TensorType : TFR_Type<"TFRTensor">; @@ -178,7 +178,7 @@ def TFR_CastOp : TFR_Op<"cast", [Pure]> { // Return element type of the input tensor type. Only available when the // input is a MLIR built-in tensor type. Attribute getInputElementType() { - if (auto ty = getArg().getType().dyn_cast()) { + if (auto ty = llvm::dyn_cast(getArg().getType())) { return TypeAttr::get(ty.getElementType()); } return {}; diff --git a/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc b/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc index 9cc555b78935..fb0640536d4f 100644 --- a/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc +++ b/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc @@ -29,6 +29,7 @@ limitations under the License. #include "mlir/IR/Region.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/Inliner.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h" @@ -142,8 +143,9 @@ LogicalResult SimplifySCFIfOp::InlineRegion(Location loc, Operation *inline_point, Region *region) const { InlinerInterface interface(loc.getContext()); - if (failed(inlineRegion(interface, region, inline_point, {}, - inline_point->getResults(), loc, + InlinerConfig config; + if (failed(inlineRegion(interface, config.getCloneCallback(), region, + inline_point, {}, inline_point->getResults(), loc, /*shouldCloneInlinedRegion=*/true))) { return failure(); } diff --git a/tensorflow/compiler/mlir/tfr/passes/decompose.cc b/tensorflow/compiler/mlir/tfr/passes/decompose.cc index 3a5d6f23072b..7b3299cf5212 100644 --- a/tensorflow/compiler/mlir/tfr/passes/decompose.cc +++ b/tensorflow/compiler/mlir/tfr/passes/decompose.cc @@ -47,8 +47,8 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "mlir/Transforms/Inliner.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h" #include "tensorflow/compiler/mlir/tfr/ir/tfr_types.h" @@ -282,6 +282,7 @@ LogicalResult DecomposeTFOpsPass::RewriteUnregisteredTFOps() { LogicalResult DecomposeTFOpsPass::InlineTFRFuncCalls() { // The Inliner will automatically use the registered dialect inliner. InlinerInterface inliner(&getContext()); + InlinerConfig config; func::FuncOp func = getOperation(); SymbolTable table(external_tfr_module_.has_value() ? *external_tfr_module_ @@ -301,7 +302,7 @@ LogicalResult DecomposeTFOpsPass::InlineTFRFuncCalls() { // Use the inliner to replace all the uses of the call_op by its // composition. - if (failed(inlineCall(inliner, + if (failed(inlineCall(inliner, config.getCloneCallback(), cast(call_op.getOperation()), cast(callee.getOperation()), callee.getCallableRegion(), diff --git a/tensorflow/compiler/mlir/tfr/passes/decompose_patterns.td b/tensorflow/compiler/mlir/tfr/passes/decompose_patterns.td index 503fd6256f16..d3b0322095d8 100644 --- a/tensorflow/compiler/mlir/tfr/passes/decompose_patterns.td +++ b/tensorflow/compiler/mlir/tfr/passes/decompose_patterns.td @@ -21,7 +21,7 @@ include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.td" class Quantize : NativeCodeCall<"TFR::Quantize(" # value # ", $0, $1, $_builder)">; class HasStringAttr : AttrConstraint< - CPred<"$_self.cast().getValue() == \"" # value # "\"">>; + CPred<"llvm::cast($_self).getValue() == \"" # value # "\"">>; def QuantActRangeNonePattern : Pattern< diff --git a/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc b/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc index 4f079395063a..94a84cc3072e 100644 --- a/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc +++ b/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc @@ -50,7 +50,6 @@ limitations under the License. #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h" #include "tensorflow/compiler/mlir/tfr/ir/tfr_types.h" diff --git a/tensorflow/compiler/mlir/tfr/passes/rewrite_quantized_io.cc b/tensorflow/compiler/mlir/tfr/passes/rewrite_quantized_io.cc index 34ae51c14ed1..0a30c8f21b58 100644 --- a/tensorflow/compiler/mlir/tfr/passes/rewrite_quantized_io.cc +++ b/tensorflow/compiler/mlir/tfr/passes/rewrite_quantized_io.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" #include "mlir/Dialect/Quant/IR/QuantTypes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -54,12 +55,11 @@ void RewriteQuantizedIOPass::runOnOperation() { // with input_arg(tensor) -> tfr.cast for (BlockArgument arg : block.getArguments()) { Type arg_type = arg.getType(); - if (auto quant_type = arg_type.cast() - .getElementType() - .dyn_cast()) { + if (auto quant_type = llvm::dyn_cast( + llvm::cast(arg_type).getElementType())) { if (arg.hasOneUse() && llvm::isa(*arg.user_begin())) { - arg.setType( - arg_type.cast().clone(quant_type.getStorageType())); + arg.setType(llvm::cast(arg_type).clone( + quant_type.getStorageType())); } else { std::string error_message; llvm::raw_string_ostream os{error_message}; @@ -77,17 +77,17 @@ void RewriteQuantizedIOPass::runOnOperation() { // with tfr.cast(tensor) -> output for (OpOperand& returned_value : terminator->getOpOperands()) { auto returned_type = - returned_value.get().getType().dyn_cast(); + llvm::dyn_cast(returned_value.get().getType()); if (!returned_type || - !returned_type.getElementType().isa()) { + !llvm::isa(returned_type.getElementType())) { continue; } if (auto returned_op = returned_value.get().getDefiningOp()) { - auto new_type = returned_type.clone(returned_type.getElementType() - .cast() - .getStorageType()); + auto new_type = returned_type.clone( + llvm::cast(returned_type.getElementType()) + .getStorageType()); auto new_op = builder.create( returned_op->getLoc(), new_type, returned_op.getArg()); returned_value.set(new_op.getResult()); diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD index 7c18a25ef083..2439e8e3b5e9 100644 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/BUILD @@ -60,16 +60,10 @@ td_library( gentbl_cc_library( name = "runtime_fallback_ops_inc_gen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "runtime_fallback_ops.h.inc", - ), - ( - ["-gen-op-defs"], - "runtime_fallback_ops.cc.inc", - ), - ], + tbl_outs = { + "runtime_fallback_ops.h.inc": ["-gen-op-decls"], + "runtime_fallback_ops.cc.inc": ["-gen-op-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "runtime_fallback/runtime_fallback_ops.td", deps = [":runtime_fallback_ops_td_files"], @@ -556,6 +550,7 @@ cc_library( "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", "@local_xla//xla/mlir_hlo", + "@stablehlo//:register", "@tf_runtime//:init_tfrt_dialects", "@tf_runtime//:print_stream_pass", ], diff --git a/tensorflow/compiler/mlir/tfrt/ir/BUILD b/tensorflow/compiler/mlir/tfrt/ir/BUILD index b29066807fbf..ae5379f2102f 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/BUILD +++ b/tensorflow/compiler/mlir/tfrt/ir/BUILD @@ -141,16 +141,10 @@ td_library( gentbl_cc_library( name = "tfrt_fallback_opdefs_inc_gen", compatible_with = get_compatible_with_portable(), # copybara: comment - tbl_outs = [ - ( - ["-gen-op-decls"], - "tfrt_fallback.h.inc", - ), - ( - ["-gen-op-defs"], - "tfrt_fallback.cpp.inc", - ), - ], + tbl_outs = { + "tfrt_fallback.h.inc": ["-gen-op-decls"], + "tfrt_fallback.cpp.inc": ["-gen-op-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tfrt_fallback.td", deps = [":tfrt_fallback_td_files"], @@ -159,16 +153,10 @@ gentbl_cc_library( gentbl_cc_library( name = "tfrt_fallback_async_opdefs_inc_gen", compatible_with = get_compatible_with_portable(), # copybara: comment - tbl_outs = [ - ( - 
["-gen-op-decls"], - "tfrt_fallback_async.h.inc", - ), - ( - ["-gen-op-defs"], - "tfrt_fallback_async.cpp.inc", - ), - ], + tbl_outs = { + "tfrt_fallback_async.h.inc": ["-gen-op-decls"], + "tfrt_fallback_async.cpp.inc": ["-gen-op-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tfrt_fallback_async.td", deps = [":tfrt_fallback_td_files"], @@ -176,23 +164,14 @@ gentbl_cc_library( gentbl_cc_library( name = "tfrt_fallback_sync_opdefs_inc_gen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "tfrt_fallback_sync.h.inc", - ), - ( - ["-gen-op-defs"], - "tfrt_fallback_sync.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=tfrt_fallback_sync", - ], - "tfrt_fallback_sync_dialect.h.inc", - ), - ], + tbl_outs = { + "tfrt_fallback_sync.h.inc": ["-gen-op-decls"], + "tfrt_fallback_sync.cpp.inc": ["-gen-op-defs"], + "tfrt_fallback_sync_dialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=tfrt_fallback_sync", + ], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tfrt_fallback_sync.td", test = True, @@ -219,23 +198,14 @@ td_library( gentbl_cc_library( name = "tfrt_gpu_opdefs_inc_gen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "gpu_ops.h.inc", - ), - ( - ["-gen-op-defs"], - "gpu_ops.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=gpurt", - ], - "gpurt_dialect.h.inc", - ), - ], + tbl_outs = { + "gpu_ops.h.inc": ["-gen-op-decls"], + "gpu_ops.cpp.inc": ["-gen-op-defs"], + "gpurt_dialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=gpurt", + ], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "gpu_ops.td", test = True, diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD index 374aad2a242d..200f66fd722f 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD @@ -23,16 +23,10 @@ td_library( gentbl_cc_library( name = "mlrt_ops_inc_gen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "mlrt_ops.h.inc", - ), - ( - ["-gen-op-defs"], - "mlrt_ops.cpp.inc", - ), - ], + tbl_outs = { + "mlrt_ops.h.inc": ["-gen-op-decls"], + "mlrt_ops.cpp.inc": ["-gen-op-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "mlrt_ops.td", deps = [":mlrt_td_files"], @@ -96,16 +90,10 @@ td_library( gentbl_cc_library( name = "tf_mlrt_ops_inc_gen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "tf_mlrt_ops.h.inc", - ), - ( - ["-gen-op-defs"], - "tf_mlrt_ops.cpp.inc", - ), - ], + tbl_outs = { + "tf_mlrt_ops.h.inc": ["-gen-op-decls"], + "tf_mlrt_ops.cpp.inc": ["-gen-op-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tf_mlrt_ops.td", deps = [":tf_mlrt_td_files"], @@ -113,16 +101,10 @@ gentbl_cc_library( gentbl_cc_library( name = "tf_mlrt_tpu_ops_inc_gen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "tf_mlrt_tpu_ops.h.inc", - ), - ( - ["-gen-op-defs"], - "tf_mlrt_tpu_ops.cpp.inc", - ), - ], + tbl_outs = { + "tf_mlrt_tpu_ops.h.inc": ["-gen-op-decls"], + "tf_mlrt_tpu_ops.cpp.inc": ["-gen-op-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tf_mlrt_tpu_ops.td", deps = [":tf_mlrt_tpu_td_files"], @@ -130,16 +112,10 @@ gentbl_cc_library( gentbl_cc_library( name = "tf_ops_inc_gen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "tf_ops.h.inc", - ), - ( - ["-gen-op-defs"], - "tf_ops.cpp.inc", - ), - ], + tbl_outs = { + "tf_ops.h.inc": ["-gen-op-decls"], + "tf_ops.cpp.inc": ["-gen-op-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tf_ops.td", deps = [":tf_mlrt_td_files"], diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td 
b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td index b260dcb402f3..13409c3ece1f 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td @@ -29,7 +29,7 @@ def Mlrt_Dialect : Dialect { } def MlrtFutureType : DialectType()">, "!mlrt.future type">, + CPred<"::llvm::isa<::mlrt::compiler::FutureType>($_self)">, "!mlrt.future type">, BuildableType<"$_builder.getType<::mlrt::compiler::FutureType>()"> { let description = [{ `!mlrt.future type` represents a C++ mlrt::Future. @@ -37,7 +37,7 @@ def MlrtFutureType : DialectType()">, "!mlrt.promise type">, + CPred<"::llvm::isa<::mlrt::compiler::PromiseType>($_self)">, "!mlrt.promise type">, BuildableType<"$_builder.getType<::mlrt::compiler::PromiseType>()"> { let description = [{ `!mlrt.promise type` represents a C++ mlrt::Promise. @@ -45,7 +45,7 @@ def MlrtPromiseType : DialectType()">, "!mlrt.async_handle type">, + CPred<"::llvm::isa<::mlrt::compiler::AsyncHandleType>($_self)">, "!mlrt.async_handle type">, BuildableType<"$_builder.getType<::mlrt::compiler::AsyncHandleType>()"> { let description = [{ `!mlrt.async_handle type` represents a C++ mlrt::AsyncHandle. diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_dialect.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_dialect.td index 9cf997e0c3e8..e706ac0e36c7 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_dialect.td +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_dialect.td @@ -37,7 +37,7 @@ class TensorflowMlrt_Op traits = []> : // This corresponds to tensorflow::Tensor. def TFTensorType : DialectType()">, "!tf_mlrt.tensor type">, + CPred<"::llvm::isa<::tensorflow::tf_mlrt::TFTensorType>($_self)">, "!tf_mlrt.tensor type">, BuildableType<"$_builder.getType<::tensorflow::tf_mlrt::TFTensorType>()"> { let description = [{ `!tf_mlrt.tensor type` represents a tensorflow::Tensor. @@ -46,7 +46,7 @@ def TFTensorType : DialectType()">, "!tf_mlrt.device type">, + CPred<"::llvm::isa<::tensorflow::tf_mlrt::TFDeviceType>($_self)">, "!tf_mlrt.device type">, BuildableType<"$_builder.getType<::tensorflow::tf_mlrt::TFDeviceType>()"> { let description = [{ `!tf_mlrt.device type` represents a tensorflow::device. diff --git a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.td b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.td index 0c42590f9aa7..6587f825d7a0 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.td +++ b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.td @@ -31,7 +31,7 @@ def Fallback_Dialect : Dialect { // This corresponds to tensorflow::Tensor. def TFTensorType : DialectType()">, "!tfrt_fallback.tf_tensor type">, + CPred<"::llvm::isa<::tfrt::fallback::TFTensorType>($_self)">, "!tfrt_fallback.tf_tensor type">, BuildableType<"$_builder.getType<::tfrt::fallback::TFTensorType>()"> { let description = [{ `!tfrt_fallback.tf_tensor type` represents a tensorflow::Tensor. @@ -40,7 +40,7 @@ def TFTensorType : DialectType()">, "!tfrt_fallback.tf_allocator type">, + CPred<"::llvm::isa<::tfrt::fallback::TFAllocatorType>($_self)">, "!tfrt_fallback.tf_allocator type">, BuildableType<"$_builder.getType<::tfrt::fallback::TFAllocatorType>()"> { let description = [{ `!tfrt_fallback.tf_tensor type` represents a tensorflow::Tensor. 
diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h index 9d77a1a73aa8..5a5d64e90463 100644 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h +++ b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h @@ -18,6 +18,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ #define TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ +#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/rewrite_ifrt_load_variable.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/rewrite_ifrt_load_variable.mlir index a862e6abf727..fa2ec0b14c81 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/rewrite_ifrt_load_variable.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/rewrite_ifrt_load_variable.mlir @@ -18,3 +18,26 @@ %2 = "tf.IfrtCall"(%arg0, %array_key) <{program_id = 6515870160938153680 : i64, variable_arg_indices = [1 : i32]}> {__tpu_compile_metadata_text = "retvals { sharding { } }"} : (tensor<1x3xf32>, tensor) -> tensor<1x1xf32> return %2 : tensor<1x1xf32> } + + +// ----- +// Variable is used by two CPU ops +// +// CHECK-LABEL: func @serving_default +// CHECK-NEXT: [[HANDLE:%.*]] = "tf.VarHandleOp"() +// CHECK-NEXT: [[ARRAYKEY:%.*]], [[FURTURE:%.*]] = "tf_mlrt.tf_ifrt_load_variable"([[HANDLE]]) +// CHECK-SAME: <{used_by_host = true}> : (tensor>>) -> (tensor, !mlrt.future) +// CHECK: [[TENSOR:%.*]] = "tf_mlrt.tf_await"([[FURTURE]]) : (!mlrt.future) -> tensor<3x1xf32> +// CHECK-NEXT: "tf.AddV2"([[TENSOR]], %cst) : (tensor<3x1xf32>, tensor<3x1xf32>) -> tensor<3x1xf32> +// CHECK-NEXT: "tf.Sub"([[TENSOR]], %cst) : (tensor<3x1xf32>, tensor<3x1xf32>) -> tensor<3x1xf32> +// CHECK-NEXT: return +// + func.func @serving_default() { + %0 = "tf.VarHandleOp"() <{container = "", shared_name = "y"}> : () -> tensor>> + %array_key, %tensor = "tf.IfrtLoadVariable"(%0) <{used_by_host = true}> : (tensor>>) -> (tensor, tensor<3x1xf32>) + %cst_24 = "tf.Const"() <{value = dense<[[0.0], [1.0], [2.0]]> : tensor<3x1xf32>}> : () -> tensor<3x1xf32> + %1 = "tf.AddV2"(%tensor, %cst_24) : (tensor<3x1xf32>, tensor<3x1xf32>) -> tensor<3x1xf32> + %2 = "tf.Sub"(%tensor, %cst_24) : (tensor<3x1xf32>, tensor<3x1xf32>) -> tensor<3x1xf32> + + return + } diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir index a74d6509a0ed..bfb3cc28a217 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir @@ -684,3 +684,69 @@ func.func @func(%arg0: tensor, %arg1: tensor>>, tensor<0xi32>) -> tensor<3xf32> return %2 : tensor<3xf32> } + +// ----- + +// Test a while to map_fn conversion in which a tf.StopGradient is inserted to consume the while result. 
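This scenario has a pass-side counterpart later in the diff (while_to_map_fn.cc), where the while result's user is looked through before checking for TensorListStack/TensorArrayGatherV3; the elided isa<> argument there is presumably TF::StopGradientOp, which is exactly what the test body below exercises. A hedged sketch of that look-through (helper name illustrative):

    #include "llvm/Support/Casting.h"
    #include "mlir/IR/Operation.h"
    #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"

    // Assumes the while result has at least one user, as in the pass change.
    mlir::Operation *skipStopGradient(mlir::Operation *use_op) {
      if (llvm::isa<mlir::TF::StopGradientOp>(use_op))
        use_op = *use_op->getUsers().begin();
      return use_op;
    }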
+// CHECK-LABEL: @while_map_while_body_884030 +func.func private @while_map_while_body_884030(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor>>, %arg4: tensor>> {tf._user_specified_name = "while/map/TensorArrayUnstack/TensorListFromTensor"}) -> (tensor, tensor, tensor, tensor>>, tensor>>) { + %cst = "tf.Const"() <{value = dense<[-1, -1, 1]> : tensor<3xi32>}> : () -> tensor<3xi32> + %cst_0 = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor + %0 = "tf.AddV2"(%arg2, %cst_0) : (tensor, tensor) -> tensor + %1 = "tf.Identity"(%0) : (tensor) -> tensor + %2 = "tf.TensorListGetItem"(%arg4, %arg2, %cst) : (tensor>>, tensor, tensor<3xi32>) -> tensor + %3 = "tf.EncodePng"(%2) <{compression = -1 : i64}> : (tensor) -> tensor + %4 = "tf.TensorListSetItem"(%arg3, %arg2, %3) <{resize_if_index_out_of_bounds = false}> : (tensor>>, tensor, tensor) -> tensor>> + %5 = "tf.Identity"(%4) : (tensor>>) -> tensor>> + %6 = "tf.AddV2"(%arg0, %cst_0) : (tensor, tensor) -> tensor + %7 = "tf.Identity"(%6) : (tensor) -> tensor + %8 = "tf.Identity"(%arg1) : (tensor) -> tensor + return %7, %8, %1, %5, %arg4 : tensor, tensor, tensor, tensor>>, tensor>> +} + +// CHECK-LABEL: while_map_while_body_884030/MapFnBody +// CHECK: tf.AddV2 +// CHECK-NEXT: tf.TensorListGetItem +// CHECK-NEXT: tf.EncodePng +// CHECK-NEXT: tf.AddV2 +// CHECK-NEXT: tf_await +// CHECK-NEXT: tf.TensorListSetItem +// CHECK-NEXT: tf_promise + +// CHECK-LABEL: @while_map_while_cond_884020 +func.func private @while_map_while_cond_884020(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor>>, %arg4: tensor>>) -> tensor { + %cst = "tf.Const"() <{value = dense<11> : tensor}> : () -> tensor + %0 = "tf.Less"(%arg2, %cst) : (tensor, tensor) -> tensor + %1 = "tf.Less"(%arg0, %arg1) : (tensor, tensor) -> tensor + %2 = "tf.LogicalAnd"(%1, %0) : (tensor, tensor) -> tensor + %3 = "tf.Identity"(%2) : (tensor) -> tensor + return %3 : tensor +} + +// CHECK-LABEL: @main +// CHECK: tf.Cast +// CHECK-NEXT: tf.TensorListReserve +// CHECK-NEXT: tf.Transpose +// CHECK-NEXT: tf.TensorListFromTensor +// CHECK-NEXT: tf_mlrt.tf_map_fn +// CHECK-SAME: {body_fn = @"while_map_while_body_884030/MapFnBody", num_tensor_list_or_flow_in = 1 : i32} : (tensor, tensor>>, tensor, tensor>>) -> tensor>> +// CHECK-NEXT: tf.StopGradient +// CHECK-NEXT: tf.TensorListStack +func.func @main(%arg0: tensor<1x?x?x11xf32>) -> tensor<11x!tf_type.string> { + %cst_0 = "tf.Const"() <{value = dense<[3, 1, 2, 0]> : tensor<4xi32>}> : () -> tensor<4xi32> + %cst_10 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + %cst_11 = "tf.Const"() <{value = dense<2> : tensor}> : () -> tensor + %cst_12 = "tf.Const"() <{value = dense<1> : tensor}> : () -> tensor + %cst_13 = "tf.Const"() <{value = dense<[-1, -1, 1]> : tensor<3xi32>}> : () -> tensor<3xi32> + %cst_14 = "tf.Const"() <{value = dense<> : tensor<0xi32>}> : () -> tensor<0xi32> + %cst_15 = "tf.Const"() <{value = dense<-1> : tensor}> : () -> tensor + %cst_16 = "tf.Const"() <{value = dense<11> : tensor}> : () -> tensor + %92 = "tf.Cast"(%arg0) <{Truncate = false}> : (tensor<1x?x?x11xf32>) -> tensor<1x?x?x11xui8> + %0 = "tf.TensorListReserve"(%cst_15, %cst_16) : (tensor, tensor) -> tensor>> + %93 = "tf.Transpose"(%92, %cst_0) : (tensor<1x?x?x11xui8>, tensor<4xi32>) -> tensor<11x?x?x1xui8> + %94 = "tf.TensorListFromTensor"(%93, %cst_13) : (tensor<11x?x?x1xui8>, tensor<3xi32>) -> tensor>> + %95:5 = "tf.While"(%cst_10, %cst_16, %cst_10, %0, %94) <{body = @while_map_while_body_884030, cond = @while_map_while_cond_884020, is_stateless = 
true, parallel_iterations = 16 : i64, shape_invariant}> {T = [i32, i32, i32, !tf_type.variant, !tf_type.variant], _lower_using_switch_merge = true, _num_original_outputs = 5 : i64, _read_only_resource_inputs = [], device = "", output_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>]} : (tensor, tensor, tensor, tensor>>, tensor>>) -> (tensor, tensor, tensor, tensor>>, tensor>>) + %96 = "tf.StopGradient"(%95#3) : (tensor>>) -> tensor>> + %97 = "tf.TensorListStack"(%96, %cst_14) <{num_elements = 11 : i64}> : (tensor>>, tensor<0xi32>) -> tensor<11x!tf_type.string> + return %97 : tensor<11x!tf_type.string> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc b/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc index 0de1d1eaabf4..c6d21e330ad6 100644 --- a/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc +++ b/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc @@ -19,6 +19,7 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Tools/mlir-opt/MlirOptMain.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project +#include "stablehlo/dialect/Register.h" // from @stablehlo #include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" @@ -61,6 +62,7 @@ int main(int argc, char **argv) { mlrt::compiler::MlrtDialect>(); tensorflow::RegisterTPUDialects(®istry); tensorflow::RegisterGpuDialects(®istry); + mlir::stablehlo::registerAllDialects(registry); tfrt::RegisterTFRTDialects(registry); tensorflow::tfrt_compiler::RegisterTPULowerClusterToRuntimeOpsPassPipeline(); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD index 2162d37eebcf..d2e9d84c1936 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD @@ -20,10 +20,8 @@ package_group( ] + if_google([ "//learning/brain/tfrt/cpp_tests/...", "//learning/serving/servables/tfrt/...", - "//learning/pathways/serving/runtime/...", - "//learning/pathways/serving/tests/...", "//learning/brain/tfrt/ifrt/...", - "//learning/brain/tfrt/mlir/mlrt/application/pathways/compiler/...", + "//learning/brain/tfrt/tfrt_session/...", # Allow visibility from the mlir language server. 
"//learning/brain/mlir/mlir_lsp_server/...", "//learning/infra/mira/experimental/orbax_model/serving/sidecar/...", @@ -33,15 +31,10 @@ package_group( gentbl_cc_library( name = "pass_inc_gen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=TfrtIfrtServing", - ], - "passes.h.inc", - ), - ], + tbl_outs = {"passes.h.inc": [ + "-gen-pass-decls", + "-name=TfrtIfrtServing", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes.td", deps = [ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h index 7122f26e0822..2cb92cb8baac 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h @@ -49,7 +49,6 @@ struct Tf2HloArg { tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn; std::shared_ptr topology; absl::string_view platform_name; - bool enable_r1_optimization = true; absl::StatusOr Fingerprint() const; }; diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc index 24252c40ae7d..1bd737b98c37 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc @@ -118,11 +118,10 @@ TEST_F(Tf2HloTest, Empty) { GetCompileMetadata(mlir_module.get(), *client)); TF_ASSERT_OK(UpdateCompileMetadata(compile_metadata, {})); - xla::CpuTopologyDescription cpu_topology = - xla::CpuTopologyDescription::Create( - xla::CpuId(), xla::CpuName(), /*platform_version=*/"", - /*devices=*/std::vector>{}, - /*machine_attributes=*/std::vector{}); + const xla::CpuTopologyDescription cpu_topology( + xla::CpuId(), xla::CpuName(), /*platform_version=*/"", + /*cpu_devices=*/{}, + /*machine_attributes=*/std::vector{}); std::shared_ptr cpu_topology_ptr = std::make_shared(cpu_topology); @@ -168,11 +167,10 @@ TEST_F(Tf2HloTest, Tuple) { GetCompileMetadata(mlir_module.get(), *client)); TF_ASSERT_OK(UpdateCompileMetadata(compile_metadata, dtype_and_shapes)); - xla::CpuTopologyDescription cpu_topology = - xla::CpuTopologyDescription::Create( - xla::CpuId(), xla::CpuName(), /*platform_version=*/"", - /*devices=*/std::vector>{}, - /*machine_attributes=*/std::vector{}); + const xla::CpuTopologyDescription cpu_topology( + xla::CpuId(), xla::CpuName(), /*platform_version=*/"", + /*cpu_devices=*/{}, + /*machine_attributes=*/std::vector{}); std::shared_ptr cpu_topology_ptr = std::make_shared(cpu_topology); @@ -219,11 +217,10 @@ TEST_F(Tf2HloTest, Spmd) { GetCompileMetadata(mlir_module.get(), *client)); TF_ASSERT_OK(UpdateCompileMetadata(compile_metadata, dtype_and_shapes)); - xla::CpuTopologyDescription cpu_topology = - xla::CpuTopologyDescription::Create( - xla::CpuId(), xla::CpuName(), /*platform_version=*/"", - /*devices=*/std::vector>{}, - /*machine_attributes=*/std::vector{}); + const xla::CpuTopologyDescription cpu_topology( + xla::CpuId(), xla::CpuName(), /*platform_version=*/"", + /*cpu_devices=*/{}, + /*machine_attributes=*/std::vector{}); std::shared_ptr cpu_topology_ptr = std::make_shared(cpu_topology); @@ -307,11 +304,10 @@ TEST_F(Tf2HloTest, UsingDefaultDeviceAssignment) { GetCompileMetadata(mlir_module.get(), *client)); TF_ASSERT_OK(UpdateCompileMetadata(compile_metadata, dtype_and_shapes)); - xla::CpuTopologyDescription cpu_topology = - xla::CpuTopologyDescription::Create( - xla::CpuId(), xla::CpuName(), /*platform_version=*/"", - /*devices=*/std::vector>{}, - /*machine_attributes=*/std::vector{}); 
+ const xla::CpuTopologyDescription cpu_topology( + xla::CpuId(), xla::CpuName(), /*platform_version=*/"", + /*cpu_devices=*/{}, + /*machine_attributes=*/std::vector{}); std::shared_ptr cpu_topology_ptr = std::make_shared(cpu_topology); @@ -420,11 +416,10 @@ TEST_F(Tf2HloTest, XlaCallHostCallback) { GetCompileMetadata(mlir_module.get(), *client)); TF_ASSERT_OK(UpdateCompileMetadata(compile_metadata, dtype_and_shapes)); - xla::CpuTopologyDescription cpu_topology = - xla::CpuTopologyDescription::Create( - xla::CpuId(), xla::CpuName(), /*platform_version=*/"", - /*devices=*/std::vector>{}, - /*machine_attributes=*/std::vector{}); + const xla::CpuTopologyDescription cpu_topology( + xla::CpuId(), xla::CpuName(), /*platform_version=*/"", + /*cpu_devices=*/{}, + /*machine_attributes=*/std::vector{}); std::shared_ptr cpu_topology_ptr = std::make_shared(cpu_topology); @@ -530,11 +525,10 @@ TEST_F(Tf2HloTest, SameArgProduceSameKeyFingerprint) { GetCompileMetadata(mlir_module.get(), *client)); TF_ASSERT_OK(UpdateCompileMetadata(compile_metadata, dtype_and_shapes)); - xla::CpuTopologyDescription cpu_topology = - xla::CpuTopologyDescription::Create( - xla::CpuId(), xla::CpuName(), /*platform_version=*/"", - /*devices=*/std::vector>{}, - /*machine_attributes=*/std::vector{}); + const xla::CpuTopologyDescription cpu_topology( + xla::CpuId(), xla::CpuName(), /*platform_version=*/"", + /*cpu_devices=*/{}, + /*machine_attributes=*/std::vector{}); std::shared_ptr cpu_topology_ptr = std::make_shared(cpu_topology); @@ -592,11 +586,10 @@ TEST_F(Tf2HloTest, DifferentCompileMetadataProduceDifferentKeyFingerprint) { GetCompileMetadata(mlir_module.get(), *client)); TF_ASSERT_OK(UpdateCompileMetadata(compile_metadata, dtype_and_shapes)); - xla::CpuTopologyDescription cpu_topology = - xla::CpuTopologyDescription::Create( - xla::CpuId(), xla::CpuName(), /*platform_version=*/"", - /*devices=*/std::vector>{}, - /*machine_attributes=*/std::vector{}); + const xla::CpuTopologyDescription cpu_topology( + xla::CpuId(), xla::CpuName(), /*platform_version=*/"", + /*cpu_devices=*/{}, + /*machine_attributes=*/std::vector{}); std::shared_ptr cpu_topology_ptr = std::make_shared(cpu_topology); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.cc index 368a91ac54f9..98058a3b3202 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/rewrite_ifrt_load_variable.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include "llvm/ADT/APInt.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project @@ -75,16 +76,27 @@ class RewriteIfrtLoadVariablePass builder.create( load_variable_op->getLoc(), result_types, load_variable_op->getOperands(), load_variable_op->getAttrs()); - for (auto user : load_variable_op.getTensorFuture().getUsers()) { - builder.setInsertionPoint(user); - auto await_op = builder.create( - user->getLoc(), load_variable_op.getTensorFuture().getType(), - mlrt_load_variable_op.getTensorFuture()); + tf_mlrt::TFAwaitOp await_op; + for (auto user : llvm::make_early_inc_range( + load_variable_op.getTensorFuture().getUsers())) { + // Materialize the future for the first use. Reuse it for the rest of + // the uses. 
+ if (!await_op) { + builder.setInsertionPoint(user); + await_op = builder.create( + user->getLoc(), load_variable_op.getTensorFuture().getType(), + mlrt_load_variable_op.getTensorFuture()); + } else { + if (user->isBeforeInBlock(await_op)) { + await_op->moveBefore(user); + } + } user->replaceUsesOfWith(load_variable_op.getTensorFuture(), await_op.getResult()); } - for (auto user : load_variable_op.getArrayKey().getUsers()) { + for (auto user : llvm::make_early_inc_range( + load_variable_op.getArrayKey().getUsers())) { user->replaceUsesOfWith(load_variable_op.getArrayKey(), mlrt_load_variable_op.getArrayKey()); } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc index 58ad4d856162..a82ba0be0cd2 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc @@ -1059,8 +1059,6 @@ class TfToMlrtConversionPass type_converter_.addTargetMaterialization(future_to_tensor_materialization); type_converter_.addSourceMaterialization(future_to_tensor_materialization); - type_converter_.addArgumentMaterialization( - future_to_tensor_materialization); if (use_tpu_host_allocator_for_inputs_.hasValue()) { options_.use_tpu_host_allocator_for_inputs = diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.cc index 0bc2a9617b12..31ddaea602fe 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.cc @@ -369,8 +369,14 @@ class WhileToMapFnPass } for (auto result_index : loop_info.tensor_list_or_flow_in) { + // Checks whether the use of the tensor list or flow-in is a tensor list stack + // or tensor array gather. This may be over-conservative, but we would rather + // be correct than sorry.
mlir::Operation *use_op = *while_op->getResult(result_index).getUsers().begin(); + if (llvm::isa(use_op)) { + use_op = *use_op->getUsers().begin(); + } if (!llvm::isa(use_op)) { diff --git a/tensorflow/compiler/mlir/tfrt/transforms/xla_rewrite_pass.cc b/tensorflow/compiler/mlir/tfrt/transforms/xla_rewrite_pass.cc index 2e33dcb9e67d..0ed5a6ac1b6a 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/xla_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/xla_rewrite_pass.cc @@ -60,10 +60,8 @@ struct RewriteStatefulPartitionedCallToXlaLaunchOnCpu for (int i = 0; i < op.getNumOperands(); ++i) { auto value = op.getOperand(i); - if (value.getType() - .cast() - .getElementType() - .isa()) { + if (llvm::isa( + llvm::cast(value.getType()).getElementType())) { resources.push_back(i); } else if (auto* def = value.getDefiningOp(); def && llvm::isa(def)) { diff --git a/tensorflow/compiler/mlir/tools/BUILD b/tensorflow/compiler/mlir/tools/BUILD new file mode 100644 index 000000000000..3b29e0f56664 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/BUILD @@ -0,0 +1,51 @@ +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + "//visibility:public", + ], + licenses = ["notice"], +) + +cc_library( + name = "translate_cl_options", + srcs = [ + "tf_mlir_translate_cl.cc", + ], + hdrs = [ + "tf_mlir_translate_cl.h", + ], + deps = [ + "@llvm-project//llvm:Support", + ], + alwayslink = 1, +) + +cc_library( + name = "translate_registration", + srcs = [ + "tf_mlir_translate_registration.cc", + ], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow/translate:mlir_roundtrip_flags", + "//tensorflow/compiler/mlir/tensorflow/translate/tools:file_tf_mlir_translate", + "//tensorflow/compiler/mlir/tf2xla/api/v2:tf_executor_to_graph", + "//tensorflow/compiler/mlir/tools:translate_cl_options", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_set", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:TranslateLib", + "@local_tsl//tsl/platform:protobuf", + "@local_xla//xla/client:client_library", + "@local_xla//xla/client:compile_only_client", + "@local_xla//xla/service/cpu:cpu_compiler", + "@local_xla//xla/service/cpu:cpu_transfer_manager", + "@local_xla//xla/stream_executor/host:host_platform", + "@local_xla//xla/stream_executor/host:host_platform_id", + ], + alwayslink = 1, +) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc b/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc index 5d29b211a94f..1bdcd145d899 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc @@ -40,6 +40,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/Triple.h" #include "mlir/Dialect/MemRef/Transforms/AllocationOpInterfaceImpl.h" // from @llvm-project #include "mlir/ExecutionEngine/OptUtils.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project @@ -73,7 +74,7 @@ std::unique_ptr GetTargetMachine( } else { triple = llvm::Triple(llvm::sys::getDefaultTargetTriple()); } - module->setTargetTriple(triple.getTriple()); + module->setTargetTriple(llvm::Triple(triple.getTriple())); } std::string error; diff --git 
a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD index 42b29d86d31e..0c504a62de16 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD @@ -30,24 +30,12 @@ td_library( gentbl_cc_library( name = "tf_framework_ops_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-op-decls"], - "tf_framework_ops.h.inc", - ), - ( - ["-gen-op-defs"], - "tf_framework_ops.cc.inc", - ), - ( - ["-gen-dialect-decls"], - "tf_framework_dialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "tf_framework_dialect.cc.inc", - ), - ], + tbl_outs = { + "tf_framework_ops.h.inc": ["-gen-op-decls"], + "tf_framework_ops.cc.inc": ["-gen-op-defs"], + "tf_framework_dialect.h.inc": ["-gen-dialect-decls"], + "tf_framework_dialect.cc.inc": ["-gen-dialect-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tf_framework_ops.td", deps = [":td_files"], @@ -56,16 +44,10 @@ gentbl_cc_library( gentbl_cc_library( name = "tf_status_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [ - ( - ["-gen-enum-decls"], - "tf_status.h.inc", - ), - ( - ["-gen-enum-defs"], - "tf_status.cc.inc", - ), - ], + tbl_outs = { + "tf_status.h.inc": ["-gen-enum-decls"], + "tf_status.cc.inc": ["-gen-enum-defs"], + }, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "tf_status.td", deps = [":td_files"], diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td index d8e7617cc352..64f782d02346 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td @@ -43,7 +43,7 @@ def TFFramework_Dialect : Dialect { } def TFFramework_OpKernelContextType : DialectType()">, + CPred<"llvm::isa<::mlir::kernel_gen::tf_framework::OpKernelContextType>($_self)">, "op_kernel_construction">, BuildableType<"$_builder.getType<::mlir::kernel_gen::tf_framework::OpKernelContextType>()"> { let description = [{ @@ -53,7 +53,7 @@ def TFFramework_OpKernelContextType : DialectType()">>, + "llvm::isa<::mlir::kernel_gen::tf_framework::JITCallableType>($_self)">>, BuildableType<"$_builder.getType<::mlir::kernel_gen::tf_framework::JITCallableType>()"> { let description = [{ A `callable` represents the result of JIT compilation. 
Conceptually, it @@ -107,7 +107,7 @@ def TFFramework_TFAllocOp : TFFramework_Op<"alloc", [ }]>]; let extraClassDeclaration = [{ - MemRefType getType() { return getResult().getType().cast(); } + MemRefType getType() { return llvm::cast(getResult().getType()); } static constexpr StringRef kReuseOutputAttrName = "reuse_output"; static constexpr StringRef kReuseInputCandidatesAttrName = "reuse_input_candidates"; diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc index ec59794405b7..6f397bbcf8fb 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc @@ -170,9 +170,9 @@ absl::Status LowerHlotoLoops(mlir::ModuleOp module, pm.addNestedPass(mlir::createCanonicalizerPass()); pm.addNestedPass(mlir::createCSEPass()); pm.addNestedPass(mlir::createCanonicalizerPass()); - pm.addNestedPass(mlir::mhlo::createShapeSimplification()); - pm.addNestedPass(mlir::mhlo::createMergeAssumingOpsPass()); - pm.addNestedPass(mlir::mhlo::createBroadcastPropagationPass()); + pm.addNestedPass(mlir::kernel_gen::createShapeSimplificationPass()); + pm.addNestedPass(mlir::kernel_gen::createMergeAssumingOpsPass()); + pm.addNestedPass(mlir::kernel_gen::createBroadcastPropagationPass()); pm.addNestedPass(mlir::createCanonicalizerPass()); pm.addNestedPass(mlir::createCSEPass()); @@ -289,11 +289,12 @@ absl::Status LowerLoopsToGPU(mlir::ModuleOp module, bool index_64bit, // Make loops with min bounds into a conditional plus static bounds. pm.addNestedPass(mlir::createForLoopSpecializationPass()); // Take launches to launches with kernels. - pm.addPass(mlir::createGpuLauchSinkIndexComputationsPass()); + pm.addPass(mlir::createGpuLaunchSinkIndexComputationsPass()); const std::string gpuDataLayoutSpec = index_64bit ? "#dlti.dl_spec<#dlti.dl_entry>" : "#dlti.dl_spec<#dlti.dl_entry>"; - pm.addPass(mlir::createGpuKernelOutliningPass(gpuDataLayoutSpec)); + pm.addPass( + mlir::createGpuKernelOutliningPass({.dataLayoutStr = gpuDataLayoutSpec})); pm.addPass(::mlir::createLowerAffinePass()); // Constraints are removed as late as possible and before lowering to CFG. @@ -309,7 +310,8 @@ absl::Status LowerLoopsToGPU(mlir::ModuleOp module, bool index_64bit, } absl::Status LowerKernelBodiesToLowLevelIr(mlir::ModuleOp module, - bool apply_cl_options) { + bool apply_cl_options, + const std::string& architecture) { #if !defined(TENSORFLOW_USE_ROCM) && !defined(GOOGLE_CUDA) return absl::InternalError( "Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA are defined." @@ -337,7 +339,7 @@ absl::Status LowerKernelBodiesToLowLevelIr(mlir::ModuleOp module, auto& kernelPm = pm.nest<::mlir::gpu::GPUModuleOp>(); kernelPm.addPass(::mlir::createSCFToControlFlowPass()); #if TENSORFLOW_USE_ROCM - kernelPm.addPass(mlir::createGpuKernelToRocdlPass()); + kernelPm.addPass(mlir::createGpuKernelToRocdlPass(architecture)); #elif GOOGLE_CUDA kernelPm.addPass(mlir::createGpuKernelToNvvmPass()); kernelPm.addPass(mlir::NVVM::createOptimizeForTargetPass()); @@ -460,8 +462,15 @@ absl::StatusOr> GenerateKernelForHloCode( jit_i64_indexed_for_large_tensors, apply_cl_options)); TF_RETURN_IF_ERROR( LowerLoopsToGPU(module.get(), index_64bit, apply_cl_options)); - TF_RETURN_IF_ERROR( - LowerKernelBodiesToLowLevelIr(module.get(), apply_cl_options)); + + // Note: we're just passing the first architecture out of the list. 
This + // should be sufficient for now, but in the future perhaps we'll need + // restructure this code to generate separate MLIR modules for each + // architecture. + const std::string& first_architecture = + !architectures.empty() ? architectures[0] : ""; + TF_RETURN_IF_ERROR(LowerKernelBodiesToLowLevelIr( + module.get(), apply_cl_options, first_architecture)); TF_RETURN_IF_ERROR( AmendKernelLLVMIRWithStaticKnowledge(module.get(), apply_cl_options)); TF_RETURN_IF_ERROR(GenerateDeviceCode( diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/broadcast_propagation.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/broadcast_propagation.mlir similarity index 99% rename from third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/broadcast_propagation.mlir rename to tensorflow/compiler/mlir/tools/kernel_gen/tests/broadcast_propagation.mlir index 4bf50644127e..f366f1938e0a 100644 --- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/broadcast_propagation.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/broadcast_propagation.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-hlo-opt %s --split-input-file --mhlo-broadcast-propagation | \ +// RUN: kernel-gen-opt %s --split-input-file --mhlo-broadcast-propagation | \ // RUN: FileCheck %s // CHECK-LABEL: @single_bcast diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/merge_assuming_ops.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/merge_assuming_ops.mlir similarity index 99% rename from third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/merge_assuming_ops.mlir rename to tensorflow/compiler/mlir/tools/kernel_gen/tests/merge_assuming_ops.mlir index f8ff1a33d1c9..d463da199549 100644 --- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/merge_assuming_ops.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/merge_assuming_ops.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-hlo-opt --split-input-file --allow-unregistered-dialect \ +// RUN: kernel-gen-opt --split-input-file --allow-unregistered-dialect \ // RUN: --mhlo-merge-assuming-ops --canonicalize --cse %s | \ // RUN: FileCheck %s diff --git a/third_party/xla/xla/mlir_hlo/tests/shape_simplification.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/shape_simplification.mlir similarity index 98% rename from third_party/xla/xla/mlir_hlo/tests/shape_simplification.mlir rename to tensorflow/compiler/mlir/tools/kernel_gen/tests/shape_simplification.mlir index 998918bdfa07..f7ff67753bc2 100644 --- a/third_party/xla/xla/mlir_hlo/tests/shape_simplification.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/shape_simplification.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-hlo-opt -split-input-file -shape-simplification %s | FileCheck %s +// RUN: kernel-gen-opt -split-input-file -shape-simplification %s | FileCheck %s // Incompatible shapes. No folding. 
// CHECK-LABEL: func @f diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD index 88564d60422f..262f9fc56d78 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD @@ -85,13 +85,10 @@ cc_library( gentbl_cc_library( name = "kernel_gen_passes_inc_gen", compatible_with = get_compatible_with_portable(), - tbl_outs = [( - [ - "-gen-pass-decls", - "-name=KernelGen", - ], - "kernel_gen_passes.h.inc", - )], + tbl_outs = {"kernel_gen_passes.h.inc": [ + "-gen-pass-decls", + "-name=KernelGen", + ]}, tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes.td", deps = ["@llvm-project//mlir:PassBaseTdFiles"], @@ -113,6 +110,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core/platform:errors", "@llvm-project//llvm:Support", + "@llvm-project//llvm:TargetParser", "@llvm-project//llvm:TransformUtils", "@llvm-project//mlir:AffineDialect", "@llvm-project//mlir:ArithDialect", @@ -180,15 +178,18 @@ cc_library( cc_library( name = "passes", srcs = [ + "broadcast_propagation_pass.cc", "buffer_reuse_pass.cc", "bufferize_pass.cc", "copy_cleanup_pass.cc", "embed_tf_framework_pass.cc", "func_to_jit_invocations.cc", "fuse_inner_parallel_loops_pass.cc", + "merge_assuming_ops_pass.cc", "parallel_loops_to_sequential.cc", "rewrite_tf_framework_assert.cc", "same_shape_propagation.cc", + "shape_simplification_pass.cc", "shape_to_descriptors_pass.cc", "tensorflow_abi_knowledge_propagation.cc", ], @@ -199,8 +200,6 @@ cc_library( ":embed_tf_framework", # buildcleaner: keep ":kernel_gen_passes_inc_gen", ":tf_framework_legalize_to_llvm", # buildcleaner: keep - ":utils", - "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", @@ -210,6 +209,7 @@ cc_library( "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:LLVMDialect", "@llvm-project//mlir:LinalgDialect", "@llvm-project//mlir:MathDialect", @@ -225,7 +225,9 @@ cc_library( "@llvm-project//mlir:Support", "@llvm-project//mlir:TensorDialect", "@llvm-project//mlir:TransformUtils", - "@llvm-project//mlir:Transforms", + "@local_xla//xla/mlir_hlo", + "@local_xla//xla/mlir_hlo:mhlo_passes", "@local_xla//xla/mlir_hlo:transforms_passes", + "@stablehlo//:base", ], ) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/broadcast_propagation_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/broadcast_propagation_pass.cc new file mode 100644 index 000000000000..159e630fb8fb --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/broadcast_propagation_pass.cc @@ -0,0 +1,462 @@ +/* Copyright 2021 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +==============================================================================*/ + +#include +#include +#include +#include +#include + +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/Base.h" // from @stablehlo +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir { +namespace kernel_gen { + +using mhlo::DynamicBroadcastInDimOp; + +#define GEN_PASS_DEF_BROADCASTPROPAGATIONPASS +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +namespace { + +// To avoid duplicate broadcasts, we collect all the intended broadcasts ahead +// of realizing any broadcasts in the IR. These are broadcasted versions of +// values that we are interested in, and they are uniquely characterized by a +// `BroadcastIntent` value. 
+struct BroadcastIntent { + RankedTensorType resultType; + Value targetValue; + Value outputDimensions; + Attribute broadcastDimensions; + bool operator==(BroadcastIntent rhs) const { + return resultType == rhs.resultType && targetValue == rhs.targetValue && + outputDimensions == rhs.outputDimensions && + broadcastDimensions == rhs.broadcastDimensions; + } + bool operator!=(BroadcastIntent rhs) const { return !(*this == rhs); } +}; + +} // namespace +} // namespace kernel_gen +} // namespace mlir + +namespace llvm { + +using mlir::kernel_gen::BroadcastIntent; + +template <> +struct DenseMapInfo { + static BroadcastIntent getEmptyKey() { + return {DenseMapInfo::getEmptyKey(), + DenseMapInfo::getEmptyKey(), + DenseMapInfo::getEmptyKey(), + DenseMapInfo::getEmptyKey()}; + } + static BroadcastIntent getTombstoneKey() { + return {DenseMapInfo::getTombstoneKey(), + DenseMapInfo::getTombstoneKey(), + DenseMapInfo::getTombstoneKey(), + DenseMapInfo::getTombstoneKey()}; + } + static unsigned getHashValue(const BroadcastIntent &intent) { + return hash_combine( + DenseMapInfo::getHashValue(intent.resultType), + DenseMapInfo::getHashValue(intent.targetValue), + DenseMapInfo::getHashValue(intent.outputDimensions), + DenseMapInfo::getHashValue( + intent.broadcastDimensions)); + } + static bool isEqual(const BroadcastIntent &lhs, const BroadcastIntent &rhs) { + return lhs == rhs; + } +}; + +} // namespace llvm + +namespace mlir { +namespace kernel_gen { +namespace { + +bool allowsForElementwiseBroadcastPropagation(Operation *op) { + if (op && op->hasTrait() && + op->hasTrait() && op->getNumResults() == 1) { + return true; + } + if (op && op->hasTrait() && + op->getNumResults() == 1) { + return true; + } + return false; +} + +bool allowsForBroadcastPropagation(Operation *op) { + return llvm::isa_and_nonnull(op) || + allowsForElementwiseBroadcastPropagation(op); +} + +DenseIntElementsAttr composeBroadcastDimensionsAttr(OpBuilder &builder, + DenseIntElementsAttr a, + DenseIntElementsAttr b) { + SmallVector bVec = + llvm::to_vector(llvm::map_range(b, [](const APInt &it) { + return static_cast(it.getLimitedValue()); + })); + SmallVector composedVec = llvm::to_vector(llvm::map_range( + a, [bVec](const APInt &it) { return bVec[it.getLimitedValue()]; })); + return builder.getI64TensorAttr(composedVec); +} + +// Find all the broadcast intents and their dependencies. Start analyzing from +// the root an collect all broadcast intents that can help broadcast propagation +// from there. +void findBroadcastIntents( + DynamicBroadcastInDimOp root, Block *parentBlock, + BroadcastIntent &rootBcastIntent, + SmallVector &bcastIntents, + DenseMap> + &bcastIntentDependencies) { + OpBuilder builder(root.getContext()); + + // Use the result vector of broadcast intents as a worklist. The set of + // broadcast intents helps to ensure their uniqueness. + DenseSet bcastIntentsSet; + auto addToWorklistIfNew = [&](BroadcastIntent bcastIntent) { + if (!bcastIntentsSet.count(bcastIntent)) { + bcastIntentsSet.insert(bcastIntent); + bcastIntents.push_back(bcastIntent); + } + }; + + // Derive the broadcast intent associated with the root broadcast operation. + // Add it to the worklist to seed the analysis. + rootBcastIntent = {mlir::cast(root.getResult().getType()), + root.getOperand(), root.getOutputDimensions(), + root.getBroadcastDimensions()}; + addToWorklistIfNew(rootBcastIntent); + + // We use result vector of broadcast intents as a worklist, the first `i` + // intents of which have been processed. 
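The composeBroadcastDimensionsAttr helper above collapses two consecutive broadcast mappings into one. A worked example with plain integer vectors standing in for the DenseIntElementsAttr contents (the helper below is illustrative, not part of the pass): a is the producer broadcast's dimension map and b is the consuming intent's map, so the composition sends each original operand dimension straight to its final result dimension.

    #include <cstdint>
    #include <vector>

    std::vector<int64_t> composeDims(const std::vector<int64_t> &a,
                                     const std::vector<int64_t> &b) {
      std::vector<int64_t> composed;
      composed.reserve(a.size());
      for (int64_t i : a) composed.push_back(b[i]);   // composed[k] = b[a[k]]
      return composed;
    }
    // composeDims({1}, {1, 2}) == {2}: a tensor<3xf32> operand broadcast into
    // tensor<4x3xf32> along dim 1 and then into tensor<2x4x3xf32> along dims
    // {1, 2} can instead be broadcast once, directly into result dim 2.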
+ for (int64_t i = 0; i < static_cast(bcastIntents.size()); ++i) { + BroadcastIntent it = bcastIntents[i]; + Operation *producerOp = it.targetValue.getDefiningOp(); + + // We can propagate broadcasts over (broadcasting) element-wise operations + // and dynamic_broadcast_in_dim ops with the restriction that they must be + // in the same block as they may depend on assuming regions. + if (!producerOp || producerOp->getBlock() != parentBlock || + !allowsForBroadcastPropagation(producerOp)) { + continue; + } + + // We can skip broadcasting producers (dynamic_broadcast_in_dim ops) if we + // compose their broadcasting dimensions. + if (auto producerBcastOp = + llvm::dyn_cast(producerOp)) { + DenseIntElementsAttr composedBcastDims = composeBroadcastDimensionsAttr( + builder, producerBcastOp.getBroadcastDimensions(), + mlir::cast(it.broadcastDimensions)); + BroadcastIntent bcastedOperandIntent = { + it.resultType, producerBcastOp.getOperand(), it.outputDimensions, + composedBcastDims}; + + // Record dependency and "recur". + bcastIntentDependencies[it] = {bcastedOperandIntent}; + addToWorklistIfNew(bcastedOperandIntent); + continue; + } + + // We can propagate broadcasts over (broadcasting) element-wise operations. + // Instead of broadcasting the result of such an op, we can broadcast the + // operands and apply the element-wise operation to them. + assert(allowsForElementwiseBroadcastPropagation(producerOp)); + bcastIntentDependencies[it] = {}; + for (auto operand : producerOp->getOperands()) { + auto operandTy = mlir::cast(operand.getType()); + auto operandBcastDims = operandTy.getRank() == 0 + ? builder.getI64TensorAttr({}) + : it.broadcastDimensions; + auto bcastedOperandTy = RankedTensorType::get(it.resultType.getShape(), + operandTy.getElementType()); + BroadcastIntent bcastedOperandIntent = { + bcastedOperandTy, operand, it.outputDimensions, operandBcastDims}; + + // Record dependency and "recur". + bcastIntentDependencies[it].push_back(bcastedOperandIntent); + addToWorklistIfNew(bcastedOperandIntent); + } + } +} + +void sortBroadcastIntentsInReverseTopologicalOrder( + SmallVector &bcastIntentsVec, Block *parentBlock) { + // Sort broadcast intents in reverse topological order of the producer ops. We + // can use the positions in the block for this. All broadcast intents outside + // the block (e.g. arguments) will be sorted towards the front. + // This ordering is independent of the output dimensions as dependencies can + // only occur between broadcast intents of the same output dimension. 
+ std::sort(bcastIntentsVec.begin(), bcastIntentsVec.end(), + [parentBlock](const BroadcastIntent &a, const BroadcastIntent &b) { + Operation *producerOpA = a.targetValue.getDefiningOp(); + Operation *producerOpB = b.targetValue.getDefiningOp(); + bool aInBlock = producerOpA != nullptr && + producerOpA->getBlock() == parentBlock; + bool bInBlock = producerOpB != nullptr && + producerOpB->getBlock() == parentBlock; + if (aInBlock && bInBlock) { + return producerOpA->isBeforeInBlock(producerOpB); + } + return !aInBlock && bInBlock; + }); +} + +void setInsertionPointToEarliestPointWithAllValuesAvailable( + PatternRewriter &rewriter, Block *block, ValueRange values) { + Operation *lastDef = nullptr; + for (Value v : values) { + Operation *def = v.getDefiningOp(); + if (def && def->getBlock() == block) { + if (!lastDef || lastDef->isBeforeInBlock(def)) lastDef = def; + } + } + if (lastDef) { + rewriter.setInsertionPointAfter(lastDef); + } else { + rewriter.setInsertionPointToStart(block); + } +} + +DenseMap realizeBroadcastIntents( + SmallVector &sortedBcastIntents, + DenseMap> + &bcastIntentDependencies, + Block *parentBlock, PatternRewriter &rewriter) { + // Realize broadcast intents in order. They must be sorted so that their + // dependencies are realized before them. + DenseMap realizations; + for (auto it : sortedBcastIntents) { + Operation *producerOp = it.targetValue.getDefiningOp(); + assert(!realizations.count(it) && "expect unrealized broadcast intent"); + auto deps = bcastIntentDependencies.find(it); + + // If we cannot propagate broadcasts further, materialize them as a + // dynamic_broadcast_in_dim op. + if (!producerOp || producerOp->getBlock() != parentBlock || + !allowsForBroadcastPropagation(producerOp)) { + assert(deps == bcastIntentDependencies.end() && "expect no dependencies"); + setInsertionPointToEarliestPointWithAllValuesAvailable( + rewriter, parentBlock, + ValueRange{it.targetValue, it.outputDimensions}); + realizations[it] = rewriter.create( + it.targetValue.getLoc(), it.resultType, it.targetValue, + it.outputDimensions, + mlir::cast(it.broadcastDimensions)); + continue; + } + + // For broadcast propagation across dynamic_broadcast_in_dim ops, the + // broadcasted value is already materialized. Forward it. + if (auto producerBcastOp = + llvm::dyn_cast_or_null(producerOp)) { + assert(deps != bcastIntentDependencies.end() && + deps->second.size() == 1 && "expect one dependency"); + auto bcastedOperand = realizations.find(deps->second.front()); + assert(bcastedOperand != realizations.end()); + realizations[it] = Value(bcastedOperand->second); + continue; + } + + // Othwerwise, realize broadcast intent for a (broadcasting) element-wise + // operation based on the broadcasted operands. 
+    assert(allowsForElementwiseBroadcastPropagation(producerOp) &&
+           "expect broadcast propagation over a (broadcasting) element-wise "
+           "operation");
+    assert(deps != bcastIntentDependencies.end() &&
+           deps->second.size() == producerOp->getNumOperands() &&
+           "expect one dependency per operand");
+    auto bcastedOperands = llvm::to_vector(
+        llvm::map_range(deps->second, [&](BroadcastIntent operandIntent) {
+          auto bcastedOperand = realizations.find(operandIntent);
+          assert(bcastedOperand != realizations.end() &&
+                 "expect dependencies to be realized earlier");
+          return bcastedOperand->second;
+        }));
+    setInsertionPointToEarliestPointWithAllValuesAvailable(
+        rewriter, parentBlock, bcastedOperands);
+    OperationState newProducerOpState(
+        producerOp->getLoc(), producerOp->getName().getStringRef(),
+        bcastedOperands, it.resultType, producerOp->getAttrs());
+    Operation *newProducerOp = rewriter.create(newProducerOpState);
+    assert(newProducerOp->getNumResults() == 1 && "expect exactly one result");
+    realizations[it] = newProducerOp->getResults().front();
+  }
+
+  return realizations;
+}
+
+void transitivelyEraseUnusedSideEffectFreeOps(Operation *root,
+                                              PatternRewriter &rewriter) {
+  // Find ops to erase.
+  SmallPtrSet<Operation *, 16> opsToEraseSet;
+  SmallVector<Operation *, 16> opsToErase;
+  SmallVector<Operation *> worklist = {root};
+  while (!worklist.empty()) {
+    Operation *op = worklist.pop_back_val();
+
+    // Erase ops only once.
+    if (opsToEraseSet.count(op)) continue;
+
+    // Erase only operations that are unused and free of side effects.
+    if (!isMemoryEffectFree(op) ||
+        !llvm::all_of(op->getUsers(), [&opsToEraseSet](Operation *user) {
+          return opsToEraseSet.count(user);
+        })) {
+      continue;
+    }
+
+    // Erase and "recur".
+    opsToEraseSet.insert(op);
+    opsToErase.push_back(op);
+    for (Value operand : op->getOperands()) {
+      if (Operation *def = operand.getDefiningOp()) worklist.push_back(def);
+    }
+  }
+
+  // Finally, erase the ops in the order of their uses.
+  for (Operation *op : opsToErase) rewriter.eraseOp(op);
+}
+
+LogicalResult propagateBroadcast(DynamicBroadcastInDimOp root,
+                                 Block *parentBlock,
+                                 PatternRewriter &rewriter) {
+  // We can move broadcasts up over (i) (broadcasting) element-wise operations
+  // and (ii) dynamic_broadcast_in_dim ops. This way, we propagate them through
+  // the IR to perform them early. Instead of broadcasting the result of such an
+  // op, we can broadcast the operands and apply the element-wise operation to
+  // them.
+  //
+  // To avoid exponential growth of the IR, we will do this in two phases:
+  //   1) First, we collect all the unique broadcast intents. These are
+  //      broadcasted versions of values that we are interested in. They may
+  //      later be materialized as an explicit broadcast or they can be the
+  //      direct result of an operation over which a broadcast was propagated.
+  //   2) Then, we fulfill every broadcast intent in reverse topological order
+  //      to ensure that their dependencies (the broadcasted operands) are
+  //      available.
+
+  // Find the unique broadcast intents.
+  BroadcastIntent rootBcastIntent;
+  SmallVector<BroadcastIntent> bcastIntents;
+  DenseMap<BroadcastIntent, SmallVector<BroadcastIntent>>
+      bcastIntentDependencies;
+  findBroadcastIntents(root, parentBlock, rootBcastIntent, bcastIntents,
+                       bcastIntentDependencies);
+
+  // Fail if there is nothing but the root intent, i.e. if there is nothing to
+  // rewrite here.
+  if (bcastIntents.size() <= 1) {
+    assert(bcastIntents.front() == rootBcastIntent && "expect root intent");
+    return failure();
+  }
+
+  // Sort the broadcast intents in reverse topological order so that they can be
+  // materialized and every dependency is available when needed.
+  sortBroadcastIntentsInReverseTopologicalOrder(bcastIntents, parentBlock);
+
+  // Realize broadcast intents.
+  DenseMap<BroadcastIntent, Value> realizations = realizeBroadcastIntents(
+      bcastIntents, bcastIntentDependencies, parentBlock, rewriter);
+
+  // Find the operations that may become redundant after replacing the root
+  // operation. This allows us to transitively erase unused side effect-free
+  // operations that result from this rewrite (after the root operation is no
+  // longer accessible).
+  SmallVector<Operation *> possiblyUnused;
+  for (auto operand : root->getOperands()) {
+    if (Operation *def = operand.getDefiningOp()) possiblyUnused.push_back(def);
+  }
+
+  // Replace the root operation with its broadcast intent's realization.
+  rewriter.replaceOp(root, realizations[rootBcastIntent]);
+
+  // Erase all the operations that have become redundant as a result of this
+  // rewrite.
+  for (Operation *op : possiblyUnused) {
+    transitivelyEraseUnusedSideEffectFreeOps(op, rewriter);
+  }
+
+  return success();
+}
+
+struct BroadcastPropagationPattern
+    : public OpRewritePattern<DynamicBroadcastInDimOp> {
+  using OpRewritePattern<DynamicBroadcastInDimOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(DynamicBroadcastInDimOp op,
+                                PatternRewriter &rewriter) const override {
+    return propagateBroadcast(op, op->getBlock(), rewriter);
+  }
+};
+
+struct BroadcastPropagationPass
+    : public impl::BroadcastPropagationPassBase<BroadcastPropagationPass> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<mhlo::MhloDialect>();
+  }
+
+  void runOnOperation() override {
+    MLIRContext *ctx = &getContext();
+
+    // Collect patterns.
+    RewritePatternSet patterns(ctx);
+    patterns.add<BroadcastPropagationPattern>(ctx);
+
+    // Apply broadcast propagation in reverse order to start propagation at
+    // the root of broadcast chains. This avoids duplicate work.
+    GreedyRewriteConfig config;
+    config.setUseTopDownTraversal(false);
+
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns),
+                                     config))) {
+      return signalPassFailure();
+    }
+  }
+};
+
+}  // namespace
+
+}  // namespace kernel_gen
+}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc
index 2986d6ce6571..092b9ff7a6bf 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include
 #include
+#include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/Target/LLVMIR/Export.h"  // from @llvm-project
@@ -141,7 +142,7 @@ class GpuKernelToBlobPass
           "false";
       llvmModule->setDataLayout(xla::gpu::nvptx::DataLayout());
-      llvmModule->setTargetTriple(xla::gpu::nvptx::TargetTriple());
+      llvmModule->setTargetTriple(llvm::Triple(xla::gpu::nvptx::TargetTriple()));
       // Compile and collect requested cubin and PTX images.
std::vector images; diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/merge_assuming_ops_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/merge_assuming_ops_pass.cc new file mode 100644 index 000000000000..4b1d10ca8dd3 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/merge_assuming_ops_pass.cc @@ -0,0 +1,476 @@ +/* Copyright 2021 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +==============================================================================*/ + +#include +#include +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/Base.h" // from @stablehlo +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" + +namespace mlir { +namespace kernel_gen { + +#define GEN_PASS_DEF_MERGEASSUMINGOPSPASS +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +namespace { + +struct ShapeReificationPattern : public OpRewritePattern { + explicit ShapeReificationPattern(MLIRContext *context) + : OpRewritePattern(context) { + // Recursively reify until we hit an op that doesn't support it. + setHasBoundedRewriteRecursion(); + } + + LogicalResult matchAndRewrite(shape::ShapeOfOp op, + PatternRewriter &rewriter) const override { + // Only reify shape computation if operand allows for it. + auto shapeOrigin = op.getArg().getDefiningOp(); + if (!shapeOrigin) return failure(); + + llvm::SmallVector reifications; + if (failed(shapeOrigin.reifyReturnTypeShapes( + rewriter, shapeOrigin->getOperands(), reifications))) + return failure(); + assert(reifications.size() == 1); + Value reifiedShape = reifications.front(); + + // Insert cast if needed. 
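+    // (The reified shape may carry a different but cast-compatible type than
+    // the original shape.shape_of result, e.g. tensor<?xindex> vs.
+    // tensor<3xindex>.)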
+ if (reifiedShape.getType() != op.getType()) { + reifiedShape = rewriter.create(op.getLoc(), op.getType(), + reifiedShape); + } + + rewriter.replaceOp(op, reifiedShape); + return success(); + } +}; + +template +struct InlineBroadcastedShapeOperandsPattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(OpTy op, + PatternRewriter &rewriter) const override { + // Find all the shape operands, direct and indirect. + SmallVector inlinedOperands; + for (Value direct : op->getOperands()) { + if (auto bcastOp = direct.getDefiningOp()) { + for (Value indirect : bcastOp->getOperands()) + inlinedOperands.push_back(indirect); + } else { + inlinedOperands.push_back(direct); + } + } + + // Only rewrite if it makes a difference. + if (inlinedOperands.size() == op.getNumOperands()) return failure(); + + // Inline shape operands. + rewriter.replaceOpWithNewOp(op, op->getResultTypes(), inlinedOperands, + op->getAttrs()); + return success(); + } +}; + +LogicalResult moveUpIntoAssumingOpMatchAndRewrite(Operation *op, + PatternRewriter &rewriter) { + // Only implemented for single-result ops. + if (op->getNumResults() != 1) return failure(); + + // Find a preceding `assuming` op. + auto *theBlock = op->getBlock(); + Operation *prev = op->getPrevNode(); + while (prev != nullptr && !llvm::isa(prev)) + prev = prev->getPrevNode(); + auto assumingOp = llvm::dyn_cast_or_null(prev); + if (!assumingOp) return failure(); + assert(assumingOp->getBlock() == theBlock && op->getBlock() == theBlock && + "expect assuming op and root op to be in the same block"); + + // Make sure that all operands will be available after moving. + auto isAvailable = [&](Value v) { + Operation *def = v.getDefiningOp(); + return def == nullptr || def->getBlock() != theBlock || + !assumingOp->isBeforeInBlock(def); + }; + if (!llvm::all_of(op->getOperands(), isAvailable)) return failure(); + + Block *body = assumingOp.getBody(); + auto yieldOp = llvm::cast(body->getTerminator()); + + // Find the operands to use if the op was within the assuming region. We + // will later use their copies, as we copy the assuming op and its body. + SmallVector newOperandsUnmapped = + llvm::to_vector<8>(llvm::map_range(op->getOperands(), [&](Value v) { + for (const auto &result : llvm::enumerate(assumingOp->getResults())) { + if (result.value() == v) return yieldOp->getOperand(result.index()); + } + return v; + })); + + // Insert the rewritten assuming op right before the old one. + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(assumingOp); + auto newAssumingOp = rewriter.create( + assumingOp.getLoc(), assumingOp.getWitness(), + [&](OpBuilder &b, Location) { + // Copy body. + IRMapping mapping; + for (auto &nested : body->without_terminator()) + b.clone(nested, mapping); + + // Copy op into the new body and use the mapped operands. + for (auto it : llvm::zip(op->getOperands(), newOperandsUnmapped)) { + Value oldOperand, newOperandUnmapped; + std::tie(oldOperand, newOperandUnmapped) = it; + mapping.map(oldOperand, mapping.lookupOrDefault(newOperandUnmapped)); + } + Operation *newOp = b.clone(*op, mapping); + + // Yield the previous results and also the new ones. 
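+        // (The moved op's results are appended after the original yield
+        // operands, so they become the trailing results of the new assuming
+        // op; the replaceOp calls below rely on this ordering.)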
+ auto mappedResults = llvm::to_vector<8>(llvm::map_range( + yieldOp.getOperands(), + [&](Value v) { return mapping.lookupOrDefault(v); })); + mappedResults.append(newOp->getResults().begin(), + newOp->getResults().end()); + return mappedResults; + }); + + // Replace the assuming op and the root op with the corresponding result + // values. + ValueRange newAssumingOpResults = newAssumingOp->getResults(); + rewriter.replaceOp(assumingOp, newAssumingOpResults.drop_back()); + rewriter.replaceOp(op, newAssumingOpResults.back()); + return success(); +} + +/// Move operation into a preceding assuming op. This allows to process +/// operations that depend on the assuming op's results. It will eventually +/// allow to make assuming regions' constraints independent from each other. +template +struct MoveUpIntoAssumingOpPattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(OpTy op, + PatternRewriter &rewriter) const override { + return moveUpIntoAssumingOpMatchAndRewrite(op.getOperation(), rewriter); + } +}; + +// Move elementwise operations into a preceding assuming op. This will +// eventually allow for more fusion opportunities. +struct MoveElementwiseOpsUpIntoAssumingOpPattern : public RewritePattern { + explicit MoveElementwiseOpsUpIntoAssumingOpPattern(MLIRContext *ctx) + : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {} + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + // Apply to all elementwise and broadcasting elementwise operations with no + // side effects. + if (!op->hasTrait() && + !op->hasTrait()) { + return failure(); + } + if (!isMemoryEffectFree(op)) return failure(); + + return moveUpIntoAssumingOpMatchAndRewrite(op, rewriter); + } +}; + +// Move operation into an assuming region if all uses are within its body. +LogicalResult moveDownIntoAssumingOpMatchAndRewrite(Operation *op, + PatternRewriter &rewriter) { + auto users = op->getUsers(); + auto it = users.begin(); + auto end = users.end(); + if (it == end) return failure(); + + // Find candidate assuming op. + auto assumingOp = (it++)->getParentOfType(); + if (!assumingOp || assumingOp->isProperAncestor(op)) return failure(); + + // Make sure all uses are within the unique assuming op's body. + while (it != end) { + auto hopefullySameAssumingOp = (it++)->getParentOfType(); + if (!hopefullySameAssumingOp || hopefullySameAssumingOp != assumingOp) { + return failure(); + } + } + + // Move op into the assuming region. + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(assumingOp.getBody()); + Operation *newOp = rewriter.clone(*op); + rewriter.replaceOp(op, newOp->getResults()); + return success(); +} + +// Move elementwise operations into succeeding assuming regions. This will +// eventually allow for more fusion opportunities. +struct MoveElementwiseOpsDownIntoAssumingOpPattern : public RewritePattern { + explicit MoveElementwiseOpsDownIntoAssumingOpPattern(MLIRContext *ctx) + : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {} + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + // Apply to all elementwise and broadcasting elementwise operations with no + // side effects. + if (!op->hasTrait() && + !op->hasTrait()) { + return failure(); + } + if (!isMemoryEffectFree(op)) return failure(); + + return moveDownIntoAssumingOpMatchAndRewrite(op, rewriter); + } +}; + +/// Move operation out of assuming op. 
This is only valid for +/// constraint-independent ops, like `cstr_broadcastable` and `shape_of`. It +/// will eventually allow to make assuming regions' constraints independent from +/// each other. +template +struct MoveUpOutOfAssumingOpPattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(OpTy op, + PatternRewriter &rewriter) const override { + // Must be inside of an assuming op. + auto assumingOp = op->template getParentOfType(); + if (!assumingOp) return failure(); + + // Operands must not be defined within the assuming op. + Block *body = assumingOp.getBody(); + auto isAvailable = [&](Value v) { + Operation *def = v.getDefiningOp(); + return def == nullptr || def->getBlock() != body; + }; + if (!llvm::all_of(op->getOperands(), isAvailable)) return failure(); + + // Move op before the assuming region. + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(assumingOp); + Operation *newOp = rewriter.clone(*op); + rewriter.replaceOp(op, newOp->getResults()); + + // If the assuming region yields none of the new op's results, these values + // are exclusively used in the assuming op's body. In these cases there is + // no need for further rewrites. + auto isNewOpResult = [newOp](Value v) { + return llvm::is_contained(newOp->getResults(), v); + }; + auto yieldOp = cast(body->getTerminator()); + if (llvm::none_of(yieldOp.getOperands(), isNewOpResult)) return success(); + + // If the assuming region yields any of the new op's results, these values + // can instead bypass the assuming region. There is no need to yield them + // explicitly as they are assumed to be independent. The assuming op is + // rewritten accordingly. + SmallVector replacementValues; + auto newAssumingOp = rewriter.create( + assumingOp.getLoc(), assumingOp.getWitness(), + [&](OpBuilder &b, Location) { + // Copy body. + IRMapping mapping; + for (Operation &nested : body->without_terminator()) { + b.clone(nested, mapping); + } + + // Collect new yield operands. + SmallVector newYieldOperands; + for (Value result : yieldOp.getOperands()) { + if (isNewOpResult(result)) { + replacementValues.push_back(result); + } else { + newYieldOperands.push_back(mapping.lookupOrDefault(result)); + replacementValues.push_back(nullptr); + } + } + return newYieldOperands; + }); + + // Use the assuming op's results for the missing replacement values. + auto src = newAssumingOp.getResults().begin(); + for (auto &dst : replacementValues) { + if (dst) continue; + dst = *src++; + } + + rewriter.replaceOp(assumingOp, replacementValues); + return success(); + } +}; + +/// Merge assuming regions if their constraints are independent from each other. +struct MergeAssumingOpsPattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(shape::AssumingOp op, + PatternRewriter &rewriter) const override { + // Merge assuming op with directly preceding one if both witnesses are + // available. + auto precedingOp = + llvm::dyn_cast_or_null(op->getPrevNode()); + if (!precedingOp) return failure(); + if (op.getWitness().getDefiningOp() == precedingOp) return failure(); + + // Merge witnesses. + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(precedingOp); + Value newWitness = rewriter.create( + op.getWitness().getDefiningOp()->getLoc(), + ValueRange{precedingOp.getWitness(), op.getWitness()}); + + // Merge assuming ops. 
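+    // (Illustration: two adjacent assuming regions guarded by witnesses %w0
+    // and %w1 become a single region guarded by their conjunction, with the
+    // second body appended after the first and uses of the first region's
+    // results remapped to the corresponding yielded values.)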
+ Block *body_a = precedingOp.getBody(); + Block *body_b = op.getBody(); + auto newAssumingOp = rewriter.create( + precedingOp.getLoc(), newWitness, [&](OpBuilder &b, Location) { + // Copy preceding op's body. + IRMapping mapping; + for (auto &nested : body_a->without_terminator()) { + b.clone(nested, mapping); + } + + // Map result values of preceding assuming op. + auto yieldOpA = + llvm::dyn_cast(body_a->getTerminator()); + for (auto pair : + llvm::zip(precedingOp->getResults(), yieldOpA.getOperands())) { + mapping.map(std::get<0>(pair), + mapping.lookupOrDefault(std::get<1>(pair))); + } + + // Copy op's body. + for (auto &nested : body_b->without_terminator()) { + b.clone(nested, mapping); + } + + // Collect merged assuming op's results. + SmallVector mappedResults; + auto yieldOpB = + llvm::dyn_cast(body_b->getTerminator()); + for (Value v : yieldOpA.getOperands()) { + mappedResults.push_back(mapping.lookupOrDefault(v)); + } + for (Value v : yieldOpB.getOperands()) { + mappedResults.push_back(mapping.lookupOrDefault(v)); + } + return mappedResults; + }); + + // Replace the two assuming ops with the new corresponding results. + ValueRange newResults = newAssumingOp->getResults(); + size_t splitAt = precedingOp->getNumResults(); + rewriter.replaceOp(precedingOp, newResults.take_front(splitAt)); + rewriter.replaceOp(op, newResults.drop_front(splitAt)); + return success(); + } +}; + +struct EliminateDuplicateCstrBroadcastableOps + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(shape::CstrBroadcastableOp op, + PatternRewriter &rewriter) const override { + // Search for previous occurence of the same constraint. + Operation *it = op->getPrevNode(); + while (it != nullptr) { + if (auto candidate = llvm::dyn_cast(it)) { + if (candidate.getShapes() == op.getShapes()) { + rewriter.replaceOp(op, candidate.getResult()); + return success(); + } + } + it = it->getPrevNode(); + } + + return failure(); + } +}; + +void populateMergeAssumingOpsPatterns(MLIRContext *context, + RewritePatternSet *patterns) { + patterns->add< + EliminateDuplicateCstrBroadcastableOps, + InlineBroadcastedShapeOperandsPattern, + MergeAssumingOpsPattern, MoveElementwiseOpsDownIntoAssumingOpPattern, + MoveElementwiseOpsUpIntoAssumingOpPattern, + MoveUpIntoAssumingOpPattern, + MoveUpIntoAssumingOpPattern, + MoveUpIntoAssumingOpPattern, + MoveUpOutOfAssumingOpPattern, + MoveUpOutOfAssumingOpPattern, + MoveUpOutOfAssumingOpPattern, ShapeReificationPattern>( + context); + mhlo::DynamicBroadcastInDimOp::getCanonicalizationPatterns(*patterns, + context); + mhlo::DynamicReshapeOp::getCanonicalizationPatterns(*patterns, context); + shape::AssumingAllOp::getCanonicalizationPatterns(*patterns, context); + shape::AssumingOp::getCanonicalizationPatterns(*patterns, context); + shape::BroadcastOp::getCanonicalizationPatterns(*patterns, context); + shape::CstrBroadcastableOp::getCanonicalizationPatterns(*patterns, context); + tensor::CastOp::getCanonicalizationPatterns(*patterns, context); +} + +struct MergeAssumingOpsPass + : public impl::MergeAssumingOpsPassBase { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + MLIRContext *ctx = &getContext(); + RewritePatternSet patterns(ctx); + populateMergeAssumingOpsPatterns(ctx, &patterns); + GreedyRewriteConfig config; + config.setMaxIterations(GreedyRewriteConfig::kNoLimit); + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns), + config))) 
{ + return signalPassFailure(); + } + } +}; + +} // namespace + +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h index 45e248ceb904..d9dca26c8ce3 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h @@ -38,6 +38,9 @@ limitations under the License. #define GEN_PASS_DECL_PROPAGATESHAPEKNOWLEDGETOKERNELS #define GEN_PASS_DECL_FUSEINNERPARALLELLOOPSPASS #define GEN_PASS_DECL_COPYCLEANUPPASS +#define GEN_PASS_DECL_SHAPESIMPLIFICATIONPASS +#define GEN_PASS_DECL_MERGEASSUMINGOPSPASS +#define GEN_PASS_DECL_BROADCASTPROPAGATIONPASS namespace mlir { namespace kernel_gen { diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td index 4f92be70d253..9bd6fb8b2e8b 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td @@ -137,4 +137,22 @@ def CopyCleanupPass : Pass<"copy-cleanup", "mlir::func::FuncOp"> { }]; } +def ShapeSimplificationPass + : Pass<"shape-simplification", "mlir::func::FuncOp"> { + let summary = "Simplify shape ops"; +} + +def MergeAssumingOpsPass : Pass<"mhlo-merge-assuming-ops", "func::FuncOp"> { + let summary = "Prepare moving dynamic broadcasts up over element-wise " + "operations and broadcast the operands rather than the result. This will " + "eventually allow for larger fusions."; +} + +def BroadcastPropagationPass : Pass<"mhlo-broadcast-propagation", "func::FuncOp"> { + let summary = "Move dynamic broadcasts up over element-wise operations and " + "broadcast the operands rather than the result. This will eventually allow " + "for larger fusions."; +} + + #endif // TF_KERNEL_GEN_PASSES diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_simplification_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_simplification_pass.cc new file mode 100644 index 000000000000..b5ceec7f48e8 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_simplification_pass.cc @@ -0,0 +1,253 @@ +/* Copyright 2021 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file contains the patterns to simplify shape ops that were deemed not +// suitable for shape op canonicalization in MLIR Core. 
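+//
+// For example (illustrative), operands of a shape.broadcast that cannot
+// affect the result are dropped, and a tensor.extract from a broadcasted
+// shape is rewritten to a tensor.dim on the operand that defines that
+// dimension when it can be identified.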
+ +#include +#include + +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" +#include "xla/mlir_hlo/mhlo/transforms/passes.h" + +namespace mlir { +namespace kernel_gen { + +#define GEN_PASS_DEF_SHAPESIMPLIFICATIONPASS +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +namespace { + +using shape::BroadcastOp; +using shape::ConstShapeOp; +using shape::ShapeOfOp; + +// Try to remove operands from broadcasts that don't contribute to the final +// result. +struct BroadcastRemoveSubsumedOperandsPattern + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(BroadcastOp op, + PatternRewriter &rewriter) const override { + // First collect the static components when joining all shapes. The + // resulting vector contains a static dimension if any operand has a static + // non-1 dimension in that position. The remaining dimensions are set to + // dynamic size. + SmallVector knownExtents; + SmallVector, 4> operandExtents; + for (Value shape : op.getShapes()) { + auto &extents = operandExtents.emplace_back(); + if (failed(shape::getShapeVec(shape, extents))) return failure(); + + // Prepend dynamic dims if sizes don't match. + if (extents.size() > knownExtents.size()) { + knownExtents.insert(knownExtents.begin(), + extents.size() - knownExtents.size(), + ShapedType::kDynamic); + } + + for (size_t i = 0, e = extents.size(); i != e; ++i) { + int64_t extent = extents[e - i - 1]; + if (extent != ShapedType::kDynamic && extent != 1) { + int64_t &knownExtent = knownExtents[knownExtents.size() - i - 1]; + // A dynamic dimension is subsumed by a static one, but bail out for + // known conflicting shapes. + if (knownExtent != extent && knownExtent != ShapedType::kDynamic) + return failure(); + knownExtent = extent; + } + } + } + + // If we've figured out all shapes to be constants we're done. + if (!llvm::is_contained(knownExtents, ShapedType::kDynamic)) { + rewriter.replaceOpWithNewOp( + op, op->getResultTypes(), rewriter.getIndexTensorAttr(knownExtents)); + return success(); + } + + // If only some dimensions are known see if any of the operands can be + // removed without affecting the result. + SmallVector filteredOperands; + for (auto tuple : llvm::zip(op.getShapes(), operandExtents)) { + Value shape = std::get<0>(tuple); + auto &extents = std::get<1>(tuple); + + // An operand can't be dead if it's the only operand of the maximum rank. + // Removing it would reduce the rank of the output. + if (llvm::count_if(operandExtents, [&](ArrayRef op) { + return op.size() >= extents.size(); + }) <= 1) { + filteredOperands.push_back(shape); + continue; + } + + for (size_t i = 0, e = extents.size(); i != e; ++i) { + int64_t extent = extents[e - i - 1]; + // A dimension of an operand can be subsumed if it's + // - a 1 dimension. All other operands will have 1 dims or better. + if (extent == 1) continue; + + // - a dynamic dim but the result is known to be constant. 
+ int64_t knownExtent = knownExtents[knownExtents.size() - i - 1]; + assert(knownExtent != 1); + if (knownExtent != ShapedType::kDynamic && + extent == ShapedType::kDynamic) + continue; + + // - a constant non-1 dimension equal to the "known" dim. + // In this case we also have to check whether this operand is the only + // contributor of that constant. + if (knownExtent != ShapedType::kDynamic && extent == knownExtent && + llvm::count_if(operandExtents, [&](ArrayRef operandShape) { + return i < operandShape.size() && + operandShape[operandShape.size() - i - 1] == knownExtent; + }) > 1) + continue; + + filteredOperands.push_back(shape); + break; + } + } + if (filteredOperands.size() != op.getShapes().size()) { + rewriter.replaceOpWithNewOp(op, op->getResultTypes(), + filteredOperands); + return success(); + } + return failure(); + } +}; + +// Convert cases like: +// ``` +// %1 = shape.shape_of %arg0 : tensor -> tensor<3xindex> +// %2 = shape.shape_of %arg1 : tensor -> tensor<3xindex> +// %3 = shape.broadcast %1, %2 : tensor<3xindex>, tensor<3xindex> +// -> tensor<3xindex> +// %result = tensor.extract %3[%c2] : tensor<3xindex> +// ``` +// to +// +// ``` +// %result = tensor.dim %arg0[%c2] : tensor +// ``` +struct ExtractFromBroadcastedTensorCanonicalizationPattern + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tensor::ExtractOp op, + PatternRewriter &rewriter) const override { + auto broadcastOp = op.getTensor().getDefiningOp(); + if (!broadcastOp) return failure(); + + // Confirm that there is a constant index. This is required, so we can + // confirm the DimOp's input will define the resulting broadcasted shape in + // that dimension. + auto index = + op.getIndices().front().getDefiningOp(); + if (!index) return failure(); + auto idx = index.value(); + + // Iterate through the operands with 3 considerations in this order: + // 1. If a static, non-1 dimension is seen, we know this to be the + // broadcasted result + // 2. If a single dynamic dimension is seen, we know this to be the + // broadcasted result (with a possibly 1 or non-1 result) + // 3. If no dynamic dimensions and no non-1 static dimensions are seen, we + // know the result to be 1 + // + // Iterate through all operands, keeping track of dynamic dimensions and + // returning immediately if a non-1 static dimension is seen. + ShapeOfOp dynamicShape; + int64_t numDynamic = 0; + for (auto shape : broadcastOp.getShapes()) { + auto shapeOfOp = shape.getDefiningOp(); + if (!shapeOfOp) return failure(); + auto shapedType = + mlir::cast(shapeOfOp->getOperandTypes().front()); + + // Abort on the existence of unranked shapes as they require more logic. + if (!shapedType.hasRank()) return failure(); + if (shapedType.getRank() <= idx) continue; + + // Only consider dynamic dimensions after the loop because any non-1 + // static dimension takes precedence. + if (shapedType.isDynamicDim(idx)) { + dynamicShape = shapeOfOp; + numDynamic++; + continue; + } + + if (shapedType.getDimSize(idx) == 1) continue; + + // Return as soon as we see a non-1 static dim. + rewriter.replaceOpWithNewOp( + op, shapedType.getDimSize(idx)); + return success(); + } + if (numDynamic > 1) return failure(); + + // Replace with the single dynamic dimension or 1. 
+ if (dynamicShape) { + rewriter.replaceOpWithNewOp(op, dynamicShape.getArg(), + index); + } else { + rewriter.replaceOpWithNewOp(op, 1); + } + return success(); + } +}; + +struct ShapeSimplificationPass + : public impl::ShapeSimplificationPassBase { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + } + + void runOnOperation() override { + MLIRContext *context = &getContext(); + RewritePatternSet patterns(&getContext()); + + for (auto op : context->getRegisteredOperations()) { + if (isa(op.getDialect())) + op.getCanonicalizationPatterns(patterns, context); + } + + patterns.add(context); + + auto func = getOperation(); + if (failed(applyPatternsGreedily(func, std::move(patterns)))) + return signalPassFailure(); + } +}; + +} // namespace + +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc index b002effdfccf..ff19510805fe 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc @@ -71,8 +71,6 @@ class ConvertLaunchFuncOpToTfRuntimeCallPattern private: Value generateParamsArray(gpu::LaunchFuncOp launch_op, OpAdaptor adaptor, OpBuilder &builder) const; - Value generateKernelNameConstant(StringRef moduleName, StringRef name, - Location loc, OpBuilder &builder) const; LogicalResult matchAndRewrite( gpu::LaunchFuncOp launch_op, OpAdaptor adaptor, diff --git a/tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.cc b/tensorflow/compiler/mlir/tools/tf_mlir_translate_cl.cc similarity index 98% rename from tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.cc rename to tensorflow/compiler/mlir/tools/tf_mlir_translate_cl.cc index 397a510e14c9..db21d257cd58 100644 --- a/tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.cc +++ b/tensorflow/compiler/mlir/tools/tf_mlir_translate_cl.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.h" +#include "tensorflow/compiler/mlir/tools/tf_mlir_translate_cl.h" + +#include #include "llvm/Support/CommandLine.h" diff --git a/tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.h b/tensorflow/compiler/mlir/tools/tf_mlir_translate_cl.h similarity index 91% rename from tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.h rename to tensorflow/compiler/mlir/tools/tf_mlir_translate_cl.h index b3da62caa95e..ef67186d2066 100644 --- a/tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.h +++ b/tensorflow/compiler/mlir/tools/tf_mlir_translate_cl.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_TF_MLIR_TRANSLATE_CL_H_ -#define TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_TF_MLIR_TRANSLATE_CL_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_TF_MLIR_TRANSLATE_CL_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_TF_MLIR_TRANSLATE_CL_H_ // This file contains command-line options aimed to provide the parameters // required by the TensorFlow Graph(Def) to MLIR module conversion. It is only @@ -51,4 +51,4 @@ extern llvm::cl::opt set_original_tf_func_name; extern llvm::cl::opt export_entry_func_to_flib; extern llvm::cl::opt export_original_tf_func_name; -#endif // TENSORFLOW_COMPILER_MLIR_LITE_TOOLS_TF_MLIR_TRANSLATE_CL_H_ +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_TF_MLIR_TRANSLATE_CL_H_ diff --git a/tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_registration.cc b/tensorflow/compiler/mlir/tools/tf_mlir_translate_registration.cc similarity index 96% rename from tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_registration.cc rename to tensorflow/compiler/mlir/tools/tf_mlir_translate_registration.cc index 4a07a184bbff..7d14d3e954b5 100644 --- a/tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_registration.cc +++ b/tensorflow/compiler/mlir/tools/tf_mlir_translate_registration.cc @@ -21,8 +21,8 @@ limitations under the License. #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Tools/mlir-translate/Translation.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/tools/tf_mlir_translate_cl.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tools/file_tf_mlir_translate.h" +#include "tensorflow/compiler/mlir/tools/tf_mlir_translate_cl.h" #include "tensorflow/core/framework/graph.pb.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/tosa/BUILD b/tensorflow/compiler/mlir/tosa/BUILD index 238781aa6455..4bc56d2d1b42 100644 --- a/tensorflow/compiler/mlir/tosa/BUILD +++ b/tensorflow/compiler/mlir/tosa/BUILD @@ -4,6 +4,7 @@ # https://github.com/llvm/llvm-project/blob/main/mlir/docs/Dialects/TOSA.md load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_binary") load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") # TODO: Tighten visibility once targets are at the right granularity. 
@@ -85,6 +86,7 @@ cc_library( "//tensorflow/core/kernels:conv_grad_shape_utils", "//tensorflow/lite/kernels/internal:reference_base", "@com_google_absl//absl/status", + "@gemmlowp", "@llvm-project//llvm:Support", "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:ArithUtils", @@ -252,3 +254,47 @@ cc_library( "@llvm-project//mlir:Transforms", ], ) + +tf_cc_binary( + name = "tf-tosa-opt", + testonly = True, + srcs = ["tf_tosa_opt.cc"], + tags = ["tf_tosa"], + deps = [ + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir:passes", + "//tensorflow/compiler/mlir:register_common_dialects", + "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/lite:tf_tfl_passes", # buildcleaner:keep + "//tensorflow/compiler/mlir/quantization/stablehlo:bridge_passes", + "//tensorflow/compiler/mlir/tensorflow:mlprogram_util", + "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", + "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_test_passes", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_graph_optimization_pass", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes", # buildcleaner:keep + "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:lower_cluster_to_runtime_ops", + "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:runtime_passes", + "//tensorflow/compiler/mlir/tensorflow/transforms/sparsecore:sparsecore_passes", + "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util", + "//tensorflow/compiler/mlir/tf2xla/internal/passes:clustering_passes", + "//tensorflow/compiler/mlir/tf2xla/internal/passes:mlir_to_graph_passes", + "//tensorflow/compiler/mlir/tf2xla/transforms:tf_xla_passes", + "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", + "//tensorflow/compiler/mlir/tosa:tf_passes", + "//tensorflow/compiler/mlir/tosa:tf_tfl_passes", + "//tensorflow/compiler/mlir/tosa:tfl_passes", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + "@local_xla//xla/mlir/framework/ir:xla_framework", + "@local_xla//xla/mlir/framework/transforms:passes", + "@local_xla//xla/mlir_hlo:all_passes", + ], +) + +filegroup( + name = "litfiles", + srcs = glob(["runlit*py"]), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/compiler/mlir/tosa/glob_lit_test.bzl b/tensorflow/compiler/mlir/tosa/glob_lit_test.bzl new file mode 100644 index 000000000000..c5c72a3b9610 --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/glob_lit_test.bzl @@ -0,0 +1,151 @@ +# Test definitions for Lit, the LLVM test runner. +# +# This is reusing the LLVM Lit test runner in the interim until the new build +# rules are upstreamed. +# TODO(b/136126535): remove this custom rule. +"""Lit runner globbing test +""" + +load("@bazel_skylib//lib:paths.bzl", "paths") +load( + "@local_xla//xla:lit.bzl", + "lit_script_with_xla_gpu_cuda_data_dir", +) + +# Default values used by the test runner. +_default_test_file_exts = ["mlir", ".pbtxt", ".td"] +_default_driver = "@llvm-project//mlir:run_lit.sh" +_default_size = "small" +_default_tags = [] + +# These are patterns which we should never match, for tests, subdirectories, or +# test input data files. +_ALWAYS_EXCLUDE = [ + "**/LICENSE.txt", + "**/README.txt", + "**/lit.local.cfg", + # Exclude input files that have spaces in their names, since bazel + # cannot cope with such "targets" in the srcs list. 
+ "**/* *", + "**/* */**", +] + +def _run_lit_test(name, data, size, tags, driver, features, exec_properties): + """Runs lit on all tests it can find in `data` under tensorflow/compiler/mlir. + + Note that, due to Bazel's hermetic builds, lit only sees the tests that + are included in the `data` parameter, regardless of what other tests might + exist in the directory searched. + + Args: + name: str, the name of the test, including extension. + data: [str], the data input to the test. + size: str, the size of the test. + tags: [str], tags to attach to the test. + driver: str, label of the driver shell script. + Note: use of a custom driver is not currently supported + and specifying a default driver will abort the tests. + features: [str], list of extra features to enable. + """ + + # Disable tests on windows for now, to enable testing rest of all xla and mlir. + native.py_test( + name = name, + srcs = ["@llvm-project//llvm:lit"], + tags = tags + ["no_pip", "no_windows"], + args = [ + "tensorflow/compiler/mlir/tosa/" + paths.basename(data[-1]) + " --config-prefix=runlit -v", + ] + features, + data = data + [ + "//tensorflow/compiler/mlir/tosa:litfiles", + "@llvm-project//llvm:FileCheck", + "@llvm-project//llvm:count", + "@llvm-project//llvm:not", + ], + deps = ["@pypi_lit//:pkg"], + size = size, + main = "lit.py", + exec_properties = exec_properties, + ) + +def glob_lit_tests( + name = None, + exclude = [], + test_file_exts = _default_test_file_exts, + default_size = _default_size, + size_override = {}, + data = [], + per_test_extra_data = {}, + default_tags = _default_tags, + tags_override = {}, + driver = _default_driver, + features = [], + exec_properties = {}, + use_lit_test_suite = None, # @unused + hermetic_cuda_data_dir = None): + """Creates all plausible Lit tests (and their inputs) under this directory. + + Args: + name: str, name of the test_suite rule to generate for running all tests. + exclude: [str], paths to exclude (for tests and inputs). + test_file_exts: [str], extensions for files that are tests. + default_size: str, the test size for targets not in "size_override". + size_override: {str: str}, sizes to use for specific tests. + data: [str], additional input data to the test. + per_test_extra_data: {str: [str]}, extra data to attach to a given file. + default_tags: [str], additional tags to attach to the test. + tags_override: {str: str}, tags to add to specific tests. + driver: str, label of the driver shell script. + Note: use of a custom driver is not currently supported + and specifying a default driver will abort the tests. + features: [str], list of extra features to enable. + exec_properties: a dictionary of properties to pass on. + hermetic_cuda_data_dir: string. If set, the tests will be run with a + `--xla_gpu_cuda_data_dir` flag set to the hermetic CUDA data directory. + use_lit_test_suite: unused. For compatibility. + """ + + # Ignore some patterns by default for tests and input data. + exclude = _ALWAYS_EXCLUDE + exclude + + tests = native.glob( + ["*." + ext for ext in test_file_exts], + exclude = exclude, + ) + + # Run tests individually such that errors can be attributed to a specific + # failure. 
+ all_tests = [] + for curr_test in tests: + final_test_name = curr_test + if hermetic_cuda_data_dir: + output_file = "with_xla_gpu_cuda_data_dir_{}".format(curr_test) + rule_name = "script_{}".format(output_file) + lit_script_with_xla_gpu_cuda_data_dir( + rule_name, + curr_test, + output_file, + hermetic_cuda_data_dir, + ) + final_test_name = output_file + all_tests.append(final_test_name + ".test") + + # Instantiate this test with updated parameters. + _run_lit_test( + name = final_test_name + ".test", + data = data + [final_test_name] + + per_test_extra_data.get(curr_test, []), + size = size_override.get(curr_test, default_size), + tags = default_tags + tags_override.get(curr_test, []), + driver = driver, + features = features, + exec_properties = exec_properties, + ) + + # TODO: remove this check after making it a required param. + if name: + native.test_suite( + name = name, + tests = all_tests, + tags = ["manual"], + ) diff --git a/tensorflow/compiler/mlir/tosa/runlit.cfg.py b/tensorflow/compiler/mlir/tosa/runlit.cfg.py new file mode 100644 index 000000000000..ccf0852be8f6 --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/runlit.cfg.py @@ -0,0 +1,71 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Lit runner configuration.""" + +import os +import platform +import sys +import lit.formats +from lit.llvm import llvm_config +from lit.llvm.subst import ToolSubst + +# Lint for undefined variables is disabled as config is not defined inside this +# file, instead config is injected by way of evaluating runlit.cfg.py from +# runlit.site.cfg.py which in turn is evaluated by lit.py. The structure is +# common for lit tests and intended to only persist temporarily (b/136126535). +# pylint: disable=undefined-variable +# Configuration file for the 'lit' test runner. + +# name: The name of this test suite. +config.name = 'MLIR ' + os.path.basename(config.mlir_test_dir) + +config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.cc', '.hlo', '.json', '.mlir', '.pbtxt', '.py'] + +# test_source_root: The root path where tests are located. +config.test_source_root = config.mlir_test_dir + +# test_exec_root: The root path where tests should be run. +config.test_exec_root = os.environ['RUNFILES_DIR'] + +if platform.system() == 'Windows': + tool_patterns = [ + ToolSubst('FileCheck.exe', unresolved='fatal'), + # Handle these specially as they are strings searched for during testing. + ToolSubst('count.exe', unresolved='fatal'), + ToolSubst('not.exe', unresolved='fatal') + ] + + llvm_config.config.substitutions.append( + ('%python', '"%s"' % (sys.executable))) + + llvm_config.add_tool_substitutions(tool_patterns, + [llvm_config.config.llvm_tools_dir]) +else: + llvm_config.use_default_substitutions() + +# Tweak the PATH to include the tools dir. 
+llvm_config.with_environment('PATH', config.llvm_tools_dir, append_path=True) + +tool_dirs = config.mlir_tf_tools_dirs + [ + config.mlir_tools_dir, config.llvm_tools_dir +] +tool_names = [ + 'tf-tosa-opt', +] +tools = [ToolSubst(s, unresolved='ignore') for s in tool_names] +llvm_config.add_tool_substitutions(tools, tool_dirs) +# pylint: enable=undefined-variable diff --git a/tensorflow/compiler/mlir/tosa/runlit.site.cfg.py b/tensorflow/compiler/mlir/tosa/runlit.site.cfg.py new file mode 100644 index 000000000000..3f17710069eb --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/runlit.site.cfg.py @@ -0,0 +1,63 @@ +# Copyright 2019 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Lit runner site configuration.""" + +import os +import platform +import lit.llvm + +# Handle the test srcdir for platforms. On windows, things are weird with bazel. +if platform.system() == 'Windows': + srcdir = os.environ['TEST_SRCDIR'] + real_test_srcdir = srcdir[:srcdir.find('tensorflow/compiler/mlir/tosa')] + external_srcdir = os.path.join(real_test_srcdir, 'external') +else: + real_test_srcdir = os.environ['TEST_SRCDIR'] + external_srcdir = real_test_srcdir + +# Lint for undefined variables is disabled as config is not defined inside this +# file, instead config is injected by lit.py. The structure is common for lit +# tests and intended to only persist temporarily (b/136126535). +# pylint: disable=undefined-variable +config.llvm_tools_dir = os.path.join(external_srcdir, 'llvm-project', 'llvm') +config.mlir_obj_root = os.path.join(real_test_srcdir) +config.mlir_tools_dir = os.path.join(external_srcdir, 'llvm-project', 'mlir') +# TODO(jpienaar): Replace with suffices in build rule. +config.suffixes = ['.td', '.mlir', '.pbtxt'] + +mlir_tf_tools_dirs = [ + 'tensorflow/compiler/mlir/tosa', +] +config.mlir_tf_tools_dirs = [ + os.path.join(real_test_srcdir, os.environ['TEST_WORKSPACE'], s) + for s in mlir_tf_tools_dirs +] +test_dir = os.environ['TEST_TARGET'] +test_dir = test_dir.strip('/').rsplit(':', 1)[0] +config.mlir_test_dir = os.path.join(real_test_srcdir, + os.environ['TEST_WORKSPACE'], test_dir) + +if platform.system() == 'Windows': + # Configure this to work with msys2, TF's preferred windows bash. + config.lit_tools_dir = '/usr/bin' + +lit.llvm.initialize(lit_config, config) + +# Let the main config do the real work. 
+lit_config.load_config( + config, + os.path.join( + os.path.join(real_test_srcdir, os.environ['TEST_WORKSPACE'], + 'tensorflow/compiler/mlir/tosa/runlit.cfg.py'))) +# pylint: enable=undefined-variable diff --git a/tensorflow/compiler/mlir/tosa/tests/BUILD b/tensorflow/compiler/mlir/tosa/tests/BUILD index e936d924ef4a..46a4c1fc752b 100644 --- a/tensorflow/compiler/mlir/tosa/tests/BUILD +++ b/tensorflow/compiler/mlir/tosa/tests/BUILD @@ -1,5 +1,5 @@ load("//tensorflow:tensorflow.default.bzl", "filegroup") -load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") +load("//tensorflow/compiler/mlir/tosa:glob_lit_test.bzl", "glob_lit_tests") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -22,7 +22,7 @@ filegroup( name = "test_utilities", testonly = True, data = [ - "//tensorflow/compiler/mlir:tf-opt", + "//tensorflow/compiler/mlir/tosa:tf-tosa-opt", "@llvm-project//llvm:FileCheck", "@llvm-project//llvm:not", ], diff --git a/tensorflow/compiler/mlir/tosa/tests/convert-tfl-uint8.mlir b/tensorflow/compiler/mlir/tosa/tests/convert-tfl-uint8.mlir index c0be0b6760f7..34d7007ea6cb 100644 --- a/tensorflow/compiler/mlir/tosa/tests/convert-tfl-uint8.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/convert-tfl-uint8.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt --tosa-convert-tfl-uint8 --verify-each %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --tosa-convert-tfl-uint8 --verify-each %s | FileCheck %s + // Operations for testing --tosa-convert-tfl-uint8 @@ -18,9 +18,13 @@ func.func @test_add_u8(%arg0: tensor<14x19x!quant.uniform, output_zp = -128 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-DAG: %[[multiplier:.+]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %[[shift:.+]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK-DAG: %[[input_zp:.+]] = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK-DAG: %[[output_zp:.+]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK-DAG: tosa.rescale %arg0, %[[multiplier]], %[[shift]], %[[input_zp]], %[[output_zp]] {input_unsigned = true, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} // CHECK: tfl.cast -func.func @test_cast_ui8(%arg0: tensor<1x256x256x3xui8>) -> tensor<1x256x256x3xf32> { - %0 = "tfl.cast"(%arg0) : (tensor<1x256x256x3xui8>) -> tensor<1x256x256x3xf32> +func.func @test_cast_ui8(%arg0: tensor<1x256x256x3x!quant.uniform>) -> tensor<1x256x256x3xf32> { + %0 = "tfl.cast"(%arg0) : (tensor<1x256x256x3x!quant.uniform>) -> tensor<1x256x256x3xf32> func.return %0 : tensor<1x256x256x3xf32> } diff --git a/tensorflow/compiler/mlir/tosa/tests/convert_metadata.mlir b/tensorflow/compiler/mlir/tosa/tests/convert_metadata.mlir index 5d7c3316b19e..ced3651bff32 100644 --- a/tensorflow/compiler/mlir/tosa/tests/convert_metadata.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/convert_metadata.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt --split-input-file --pass-pipeline='builtin.module(func.func(tosa-tflite-convert-function-metadata))' %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --split-input-file --pass-pipeline='builtin.module(func.func(tosa-tflite-convert-function-metadata))' %s | FileCheck %s + module attributes {tfl.schema_version = 3 : i32} { // CHECK: func.func @main( diff --git a/tensorflow/compiler/mlir/tosa/tests/fuse-bias-tf.mlir 
b/tensorflow/compiler/mlir/tosa/tests/fuse-bias-tf.mlir index f00c0358fdac..c41b202edc8f 100644 --- a/tensorflow/compiler/mlir/tosa/tests/fuse-bias-tf.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/fuse-bias-tf.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt --tosa-fuse-bias-tf --verify-each %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --tosa-fuse-bias-tf --verify-each %s | FileCheck %s + // Operations for testing --tosa-fuse-bias-tf diff --git a/tensorflow/compiler/mlir/tosa/tests/lower-complex-types.mlir b/tensorflow/compiler/mlir/tosa/tests/lower-complex-types.mlir index c9b59c2201c3..3985720caf1d 100644 --- a/tensorflow/compiler/mlir/tosa/tests/lower-complex-types.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/lower-complex-types.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt --split-input-file --tosa-lower-complex-types --verify-each %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --split-input-file --tosa-lower-complex-types --verify-each %s | FileCheck %s + // CHECK-LABEL: test_complex_input // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x4x4x2xf32> diff --git a/tensorflow/compiler/mlir/tosa/tests/multi_add.mlir b/tensorflow/compiler/mlir/tosa/tests/multi_add.mlir index 28f3192bae2f..8952d5fcd5ef 100644 --- a/tensorflow/compiler/mlir/tosa/tests/multi_add.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/multi_add.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt --tfl-to-tosa-pipeline=target-compilation-backend %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --tfl-to-tosa-pipeline=target-compilation-backend %s | FileCheck %s + // CHECK: tensor<1x8x8x3xf32> {ml_program.identifier = "a"} // CHECK-SAME: tensor<1x8x8x3xf32> {ml_program.identifier = "b"} diff --git a/tensorflow/compiler/mlir/tosa/tests/retain_call_once_funcs.mlir b/tensorflow/compiler/mlir/tosa/tests/retain_call_once_funcs.mlir index 8feb41f2631f..cf4dacffe76f 100644 --- a/tensorflow/compiler/mlir/tosa/tests/retain_call_once_funcs.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/retain_call_once_funcs.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt --split-input-file --pass-pipeline='builtin.module(tflite-retain-call-once-funcs)' %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --split-input-file --pass-pipeline='builtin.module(tflite-retain-call-once-funcs)' %s | FileCheck %s + // CHECK-LABEL: module { module { diff --git a/tensorflow/compiler/mlir/tosa/tests/strip-quant-types.mlir b/tensorflow/compiler/mlir/tosa/tests/strip-quant-types.mlir index cea7ec359b27..b595c032bef9 100644 --- a/tensorflow/compiler/mlir/tosa/tests/strip-quant-types.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/strip-quant-types.mlir @@ -1,16 +1,16 @@ -// RUN: tf-opt --split-input-file --tosa-strip-quant-types --verify-each %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --split-input-file --tosa-strip-quant-types --verify-each %s | FileCheck %s + // ----- // CHECK-LABEL: @test_max_pool2d_qi8 // CHECK-SAME: %arg0: tensor<1x4x4x4xi8>) -> tensor<1x4x4x4xi8> -func.func @test_max_pool2d_qi8(%arg0: tensor<1x4x4x4x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> { - %0 = "tosa.max_pool2d"(%arg0) {kernel = array, pad = array, stride = array} : (tensor<1x4x4x4x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> +func.func @test_max_pool2d_qi8(%arg0: tensor<1x4x4x4x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> { + %0 = "tosa.max_pool2d"(%arg0) {kernel = array, pad = array, stride = array} : (tensor<1x4x4x4x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> // CHECK: %[[VAR0:.+]] = tosa.max_pool2d %arg0 {kernel = array, pad = array, stride = 
array} : (tensor<1x4x4x4xi8>) -> tensor<1x4x4x4xi8> // CHECK: return %[[VAR0]] : tensor<1x4x4x4xi8> - func.return %0 : tensor<1x4x4x4x!quant.uniform> + func.return %0 : tensor<1x4x4x4x!quant.uniform> } // ----- diff --git a/tensorflow/compiler/mlir/tosa/tests/strip_metadata.mlir b/tensorflow/compiler/mlir/tosa/tests/strip_metadata.mlir index 5f75b923739d..e607798da0d6 100644 --- a/tensorflow/compiler/mlir/tosa/tests/strip_metadata.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/strip_metadata.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt --pass-pipeline='builtin.module(tosa-tflite-strip-module-metadata,func.func(tosa-tflite-strip-function-metadata))' %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --pass-pipeline='builtin.module(tosa-tflite-strip-module-metadata,func.func(tosa-tflite-strip-function-metadata))' %s | FileCheck %s + // CHECK-LABEL: module { // CHECK-NOT: tf.schema_version diff --git a/tensorflow/compiler/mlir/tosa/tests/tf-tfl-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tf-tfl-to-tosa-pipeline.mlir index 7eadb79b757b..fc1403205ca3 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tf-tfl-to-tosa-pipeline.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tf-tfl-to-tosa-pipeline.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt --split-input-file --tf-tfl-to-tosa-pipeline --verify-each %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --split-input-file --tf-tfl-to-tosa-pipeline --verify-each %s | FileCheck %s + // These tests focus on TensorFlow and TensorFlow Lite hybrid lowering and focus // on tfl.custom operations that are Flex ops. diff --git a/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir index 4eeec30db4c0..0bd0eeb0285d 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir @@ -1,7 +1,7 @@ -// RUN: tf-opt --tf-to-tosa-pipeline --verify-each %s | FileCheck %s -// REQUIRES: tf_tosa -// RUN: tf-opt --tf-tfl-to-tosa-pipeline --verify-each %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --tf-to-tosa-pipeline --verify-each %s | FileCheck %s + +// RUN: tf-tosa-opt --tf-tfl-to-tosa-pipeline --verify-each %s | FileCheck %s + // Operations for testing tf-to-tosa-pipeline // TODO: These tests are fairly minimal. Expand the checks to be more robust. 
@@ -9,9 +9,9 @@ // ----- // CHECK-LABEL: test_conv2d -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<16xf32>}> +// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK-DAG: %[[VAR2:.*]] = tosa.transpose %arg1 {perms = array} -// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAR4:.*]] = tosa.conv2d %arg0, %[[VAR2]], %[[VAR0]], %[[VAR3]], %[[VAR3]] {acc_type = f32, dilation = array, pad = array, stride = array} func.func @test_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x16xf32>) -> tensor<1x32x32x16xf32> { %3 = "tf.Conv2D"(%arg0, %arg1) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x32x32x8xf32>, tensor<2x2x8x16xf32>) -> tensor<1x32x32x16xf32> @@ -21,8 +21,8 @@ func.func @test_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x16xf32> // ----- // CHECK-LABEL: test_depthwise_conv2d -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>}> -// CHECK-DAG: %[[VAL1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<16xf32>}> +// CHECK-DAG: %[[VAL1:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAR2:.*]] = tosa.depthwise_conv2d %arg0, %arg1, %0, %1, %1 {acc_type = f32, dilation = array, pad = array, stride = array} func.func @test_depthwise_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x2xf32>) -> tensor<1x32x32x16xf32> { %5 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x32x32x8xf32>, tensor<2x2x8x2xf32>) -> tensor<1x32x32x16xf32> @@ -34,9 +34,9 @@ func.func @test_depthwise_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2 // CHECK-LABEL: @test_transpose_conv2d // CHECK-SAME: %[[ARG0:.*]]: tensor<1x32x32x8xf32>, %[[ARG1:.*]]: tensor<1x1x16x8xf32> -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[16, 1, 1, 8]> : tensor<4xindex>} -// CHECK-DAG: %[[CONST:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>}> -// CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[16, 1, 1, 8]> : tensor<4xindex>} +// CHECK-DAG: %[[CONST:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<16xf32>}> +// CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[RESHAPE:.*]] = tosa.reshape %[[ARG1]], %[[VAR0]] // CHECK: %[[TRANSPOSE:.*]] = tosa.transpose_conv2d %[[ARG0]], %[[RESHAPE]], %[[CONST]], %[[ZP]], %[[ZP]] {acc_type = f32, out_pad = array, stride = array} // CHECK: return %[[TRANSPOSE]] @@ -51,8 +51,8 @@ func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<1x1 // CHECK-LABEL: test_conv3d // CHECK-SAME: %[[VAL_0:.*]]: tensor<2x4x128x128x8xf32> // CHECK-SAME: %[[VAL_1:.*]]: tensor<2x3x3x2x4xf32> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<4xf32>}> -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0.000000e+00> 
: tensor<4xf32>}> +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAL_5:.*]] = tosa.transpose %[[VAL_1]] {perms = array} // CHECK: %[[VAL_6:.*]] = tosa.conv3d %[[VAL_0]], %[[VAL_5]], %[[VAL_2]], %[[VAL_4]], %[[VAL_4]] {acc_type = f32, dilation = array, pad = array, stride = array} func.func @test_conv3d(%arg0: tensor<2x4x128x128x8xf32>, %arg1: tensor<2x3x3x2x4xf32>) -> tensor<2x4x64x64x4xf32> { @@ -66,7 +66,7 @@ func.func @test_conv3d(%arg0: tensor<2x4x128x128x8xf32>, %arg1: tensor<2x3x3x2x4 // CHECK-SAME: %[[VAL_0:.*]]: tensor<3x32x16x16x5xf32> // CHECK-SAME: %[[VAL_1:.*]]: tensor<2x3x3x5x10xf32> // CHECK-SAME: %[[VAL_2:.*]]: tensor<10xf32>) -> tensor<3x32x16x16x10xf32> -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAL_5:.*]] = tosa.transpose %[[VAL_1]] {perms = array} // CHECK: %[[VAL_6:.*]] = tosa.conv3d %[[VAL_0]], %[[VAL_5]], %[[VAL_2]], %[[VAL_4]], %[[VAL_4]] {acc_type = f32, dilation = array, pad = array, stride = array} func.func @test_conv3d_bias(%arg0: tensor<3x32x16x16x5xf32>, %arg1: tensor<2x3x3x5x10xf32>, %bias: tensor<10xf32>) -> tensor<3x32x16x16x10xf32> { @@ -96,7 +96,7 @@ func.func @test_sub(%arg0: tensor<1x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> te // ----- // CHECK-LABEL: test_mul -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK: %[[VAR0:.*]] = tosa.mul %arg0, %arg1, %[[SHIFT]] func.func @test_mul(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> tensor<13x21x3xf32> { %2 = "tf.Mul"(%arg0, %arg1) : (tensor<13x21x3xf32>, tensor<13x1x3xf32>) -> tensor<13x21x3xf32> @@ -106,7 +106,7 @@ func.func @test_mul(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> te // ----- // CHECK-LABEL: test_real_div -// CHECK: %[[VAR0:.*]] = tosa.int_div %arg0, %arg1 +// CHECK: %[[VAR0:.*]] = tosa.intdiv %arg0, %arg1 func.func @test_real_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x1x3xi32>) -> tensor<13x21x3xi32> { %2 = "tf.RealDiv"(%arg0, %arg1) : (tensor<13x21x3xi32>, tensor<13x1x3xi32>) -> tensor<13x21x3xi32> func.return %2 : tensor<13x21x3xi32> @@ -114,8 +114,23 @@ func.func @test_real_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x1x3xi32>) // ----- -// CHECK-LABEL: test_floor_div -// CHECK: %[[VAR0:.*]] = tosa.int_div %arg0, %arg1 +// CHECK-LABEL: func.func @test_floor_div( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<13x1x3xi32>) -> tensor<13x21x3xi32> { +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<0> : tensor<1x1x1xi32>}> : () -> tensor<1x1x1xi32> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<1> : tensor<1x1x1xi32>}> : () -> tensor<1x1x1xi32> +// CHECK: %[[VAL_5:.*]] = tosa.intdiv %[[VAL_0]], %[[VAL_1]] : (tensor<13x21x3xi32>, tensor<13x1x3xi32>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_6:.*]] = tosa.mul %[[VAL_0]], %[[VAL_1]], %[[VAL_2]] : (tensor<13x21x3xi32>, tensor<13x1x3xi32>, tensor<1xi8>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_7:.*]] = tosa.mul %[[VAL_1]], %[[VAL_5]], %[[VAL_2]] : (tensor<13x1x3xi32>, tensor<13x21x3xi32>, tensor<1xi8>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_8:.*]] = tosa.equal %[[VAL_0]], %[[VAL_7]] : (tensor<13x21x3xi32>, 
tensor<13x21x3xi32>) -> tensor<13x21x3xi1> +// CHECK: %[[VAL_9:.*]] = tosa.logical_not %[[VAL_8]] : (tensor<13x21x3xi1>) -> tensor<13x21x3xi1> +// CHECK: %[[VAL_10:.*]] = tosa.greater %[[VAL_3]], %[[VAL_6]] : (tensor<1x1x1xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi1> +// CHECK: %[[VAL_11:.*]] = tosa.sub %[[VAL_5]], %[[VAL_4]] : (tensor<13x21x3xi32>, tensor<1x1x1xi32>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_12:.*]] = tosa.logical_and %[[VAL_9]], %[[VAL_10]] : (tensor<13x21x3xi1>, tensor<13x21x3xi1>) -> tensor<13x21x3xi1> +// CHECK: %[[VAL_13:.*]] = tosa.select %[[VAL_12]], %[[VAL_11]], %[[VAL_5]] : (tensor<13x21x3xi1>, tensor<13x21x3xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32> +// CHECK: return %[[VAL_13]] : tensor<13x21x3xi32> +// CHECK: } func.func @test_floor_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x1x3xi32>) -> tensor<13x21x3xi32> { %2 = "tf.FloorDiv"(%arg0, %arg1) : (tensor<13x21x3xi32>, tensor<13x1x3xi32>) -> tensor<13x21x3xi32> func.return %2 : tensor<13x21x3xi32> @@ -161,9 +176,9 @@ func.func @test_relu6(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_leaky_relu -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1xf32>}> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<5.000000e-01> : tensor<1x1xf32>}> -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x1xf32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<5.000000e-01> : tensor<1x1xf32>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[VAR2:.*]] = tosa.mul %arg0, %[[VAR1]], %[[SHIFT]] // CHECK-DAG: %[[VAR3:.*]] = tosa.greater_equal %arg0, %[[VAR0]] // CHECK: %[[VAR6:.*]] = tosa.select %[[VAR3]], %arg0, %[[VAR2]] @@ -248,7 +263,7 @@ func.func @test_logical_not(%arg0: tensor<1x21x3xi1>) -> tensor<1x21x3xi1> { // ----- // CHECK-LABEL: test_reduce_any -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} // CHECK-DAG: %[[VAR1:.*]] = tosa.reduce_any %arg0 {axis = 0 : i32} // CHECK: %[[VAR2:.*]] = tosa.reshape %[[VAR1]], %[[VAR0]] func.func @test_reduce_any(%arg0: tensor<13x21x3xi1>) -> tensor<21x3xi1> { @@ -261,7 +276,7 @@ func.func @test_reduce_any(%arg0: tensor<13x21x3xi1>) -> tensor<21x3xi1> { // ----- // CHECK-LABEL: test_reduce_all -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} // CHECK-DAG: %[[VAR1:.*]] = tosa.reduce_all %arg0 {axis = 0 : i32} // CHECK: %[[VAR2:.*]] = tosa.reshape %[[VAR1]], %[[VAR0]] func.func @test_reduce_all(%arg0: tensor<13x21x3xi1>) -> tensor<21x3xi1> { @@ -273,7 +288,7 @@ func.func @test_reduce_all(%arg0: tensor<13x21x3xi1>) -> tensor<21x3xi1> { // ----- // CHECK-LABEL: test_reduce_min -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} // CHECK-DAG: %[[VAR1:.*]] = tosa.reduce_min %arg0 {axis = 0 : i32} // CHECK: %[[VAR2:.*]] = tosa.reshape %[[VAR1]], %[[VAR0]] func.func @test_reduce_min(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { @@ -285,7 +300,7 @@ func.func @test_reduce_min(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // ----- // 
CHECK-LABEL: test_reduce_max -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} // CHECK-DAG: %[[VAR1:.*]] = tosa.reduce_max %arg0 {axis = 0 : i32} // CHECK: %[[VAR2:.*]] = tosa.reshape %[[VAR1]], %[[VAR0]] func.func @test_reduce_max(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { @@ -297,7 +312,7 @@ func.func @test_reduce_max(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // ----- // CHECK-LABEL: test_reduce_sum -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} // CHECK-DAG: %[[VAR1:.*]] = tosa.reduce_sum %arg0 {axis = 0 : i32} // CHECK: %[[VAR2:.*]] = tosa.reshape %[[VAR1]], %[[VAR0]] func.func @test_reduce_sum(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { @@ -310,7 +325,7 @@ func.func @test_reduce_sum(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // CHECK-LABEL: test_reduce_sum_nonzero_axis // CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20x30x40x50xf32> -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[10, 20, 30, 50]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[10, 20, 30, 50]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK-DAG: %[[VAL_2:.*]] = tosa.reduce_sum %[[VAL_0]] {axis = 3 : i32} : (tensor<10x20x30x40x50xf32>) -> tensor<10x20x30x1x50xf32> // CHECK-DAG: %[[VAL_3:.*]] = tosa.reshape %[[VAL_2]], %[[VAL_1]] : (tensor<10x20x30x1x50xf32>, !tosa.shape<4>) -> tensor<10x20x30x50xf32> // CHECK: return %[[VAL_3]] : tensor<10x20x30x50xf32> @@ -324,9 +339,9 @@ func.func @test_reduce_sum_nonzero_axis(%arg0: tensor<10x20x30x40x50xf32> {tf._u // ----- // CHECK-LABEL: test_reduce_mean -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<0.0769230798> : tensor<1x1xf32>}> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0.0769230798> : tensor<1x1xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} // CHECK: %[[VAL_4:.*]] = tosa.reduce_sum %[[VAL_0]] {axis = 0 : i32} // CHECK: %[[VAL_5:.*]] = tosa.reshape %[[VAL_4]], %[[VAL_3]] // CHECK: %[[VAL_6:.*]] = tosa.mul %[[VAL_5]], %[[VAL_2]], %[[VAL_1]] @@ -340,7 +355,7 @@ func.func @test_reduce_mean(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // CHECK-LABEL: test_reduce_product // CHECK-DAG: %[[VAR0:.*]] = tosa.reduce_product %arg0 {axis = 0 : i32} -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} // CHECK: %[[VAR1:.*]] = tosa.reshape %[[VAR0]], %[[VAR10]] func.func @test_reduce_product(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %2 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> @@ -414,7 +429,8 @@ func.func @test_log(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_negate -// CHECK: %[[VAR0:.*]] = tosa.negate %arg0 +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK-DAG: %[[VAR1:.*]] = tosa.negate %arg0, %[[VAR0]], 
%[[VAR0]] func.func @test_negate(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %2 = "tf.Neg"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> func.return %2 : tensor<13x21x3xf32> @@ -451,9 +467,9 @@ func.func @test_cos(%arg0: tensor<10xf32>) -> tensor<*xf32> { // CHECK-LABEL: test_sign // CHECK-SAME: %[[VAL_0:.*]]: tensor<8x33xf32> -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1xf32>}> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<-1.000000e+00> : tensor<1x1xf32>}> -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1xf32>}> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x1xf32>}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<-1.000000e+00> : tensor<1x1xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<1.000000e+00> : tensor<1x1xf32>}> // CHECK: %[[VAL_4:.*]] = tosa.greater %[[VAL_0]], %[[VAL_1]] // CHECK: %[[VAL_5:.*]] = tosa.greater %[[VAL_1]], %[[VAL_0]] // CHECK: %[[VAL_6:.*]] = tosa.select %[[VAL_5]], %[[VAL_2]], %[[VAL_1]] @@ -475,7 +491,7 @@ func.func @test_sigmoid(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_square -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK: %[[VAR0:.*]] = tosa.mul %arg0, %arg0, %[[SHIFT]] func.func @test_square(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %2 = "tf.Square"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> @@ -542,7 +558,8 @@ func.func @test_argmax(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xi32> { // ----- // CHECK-LABEL: test_avg_pool2d -// CHECK: %[[VAR0:.*]] = tosa.avg_pool2d %arg0 {acc_type = f32, kernel = array, pad = array, stride = array} +// CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK: %[[VAR0:.*]] = tosa.avg_pool2d %arg0, %[[ZP]], %[[ZP]] {acc_type = f32, kernel = array, pad = array, stride = array} func.func @test_avg_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32> { %2 = "tf.AvgPool"(%arg0) {data_format = "NHWC", ksize = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32> func.return %2 : tensor<1x32x32x8xf32> @@ -560,7 +577,7 @@ func.func @test_max_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32 // ----- // CHECK-LABEL: test_reshape -// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {value = dense<[1, 819]> : tensor<2xindex>} +// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {values = dense<[1, 819]> : tensor<2xindex>} // CHECK: %[[VAR0:.*]] = tosa.reshape %arg0, %[[SHAPE]] : (tensor<13x21x3xf32>, !tosa.shape<2>) -> tensor<1x819xf32> func.func @test_reshape(%arg0: tensor<13x21x3xf32>) -> tensor<1x819xf32> { %0 = "tf.Const"() {value = dense<[1, 819]> : tensor<2xi32>} : () -> tensor<2xi32> @@ -582,8 +599,8 @@ func.func @test_transpose(%arg0: tensor<13x21x3xf32>) -> tensor<3x13x21xf32> { // ----- // CHECK-LABEL: test_slice -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[4, 11, 1]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[6, 8, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[4, 11, 1]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[6, 8, 0]> : tensor<3xindex>} // CHECK: %[[VAL_3:.*]] = tosa.slice %arg0, 
%[[VAL_2]], %[[VAL_1]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<4x11x1xf32> func.func @test_slice(%arg0: tensor<13x21x3xf32>) -> tensor<4x11x1xf32> { %2 = "tf.Const"() {value = dense<[6, 8, 0]> : tensor<3xi64>} : () -> tensor<3xi64> @@ -595,12 +612,12 @@ func.func @test_slice(%arg0: tensor<13x21x3xf32>) -> tensor<4x11x1xf32> { // ----- // CHECK-LABEL: test_strided_slice -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[9, 7, 2]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<0> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[9, 7, 1, 2]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[9, 7, 3, 2]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[9, 21, 2]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {value = dense<[4, 0, 1]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[9, 7, 2]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[9, 7, 1, 2]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[9, 7, 3, 2]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[9, 21, 2]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {values = dense<[4, 0, 1]> : tensor<3xindex>} // CHECK: %[[VAL_7:.*]] = tosa.slice %[[VAL_0]], %[[VAL_6]], %[[VAL_5]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<9x21x2xf32> // CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_7]], %[[VAL_4]] : (tensor<9x21x2xf32>, !tosa.shape<4>) -> tensor<9x7x3x2xf32> // CHECK: %[[VAL_9:.*]] = tosa.slice %[[VAL_8]], %[[VAL_2]], %[[VAL_3]] : (tensor<9x7x3x2xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<9x7x1x2xf32> @@ -616,7 +633,7 @@ func.func @test_strided_slice(%arg0: tensor<13x21x3xf32>) -> tensor<9x7x2xf32> { // ----- // CHECK-LABEL: test_select -// CHECK: %[[VAR0:.*]] = tosa.const_shape {value = dense<1> : tensor<3xindex>} +// CHECK: %[[VAR0:.*]] = tosa.const_shape {values = dense<1> : tensor<3xindex>} // CHECK: %[[VAR1:.*]] = tosa.reshape %arg2, %[[VAR0]] : (tensor<1xi1>, !tosa.shape<3>) -> tensor<1x1x1xi1> // CHECK: %[[VAR2:.*]] = tosa.select %[[VAR1]], %arg0, %arg1 func.func @test_select(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %arg2: tensor<1xi1>) -> tensor<13x21x3xf32> { @@ -649,7 +666,7 @@ func.func @test_concatv2(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, // CHECK-LABEL: test_stack // CHECK-DAG: %[[VAR0:.*]] = tosa.concat %arg0, %arg1, %arg2, %arg3 {axis = 0 : i32} -// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {value = dense<[4, 13, 21, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {values = dense<[4, 13, 21, 3]> : tensor<4xindex>} // CHECK: %[[VAR1:.*]] = tosa.reshape %[[VAR0]], %[[SHAPE]] func.func @test_stack(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %arg2: tensor<13x21x3xf32>, %arg3: tensor<13x21x3xf32>) -> tensor<4x13x21x3xf32> { %2 = "tf.Pack"(%arg0, %arg1, %arg2, %arg3) {axis = 0 : i64} : (tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<4x13x21x3xf32> @@ -659,7 +676,7 @@ func.func @test_stack(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %a // ----- // CHECK-LABEL: test_unstack -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[32, 32, 8]> : 
tensor<3xindex>} +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[32, 32, 8]> : tensor<3xindex>} // CHECK: %[[VAR1:.*]] = tosa.reshape %arg0, %[[VAR0]] func.func @test_unstack(%arg0: tensor<1x32x32x8xf32>) -> tensor<32x32x8xf32> { %2 = "tf.Unpack"(%arg0) {axis = 0 : i64} : (tensor<1x32x32x8xf32>) -> tensor<32x32x8xf32> @@ -670,8 +687,8 @@ func.func @test_unstack(%arg0: tensor<1x32x32x8xf32>) -> tensor<32x32x8xf32> { // ----- // CHECK-LABEL: test_pad -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<1> : tensor<6xindex>} : () -> !tosa.shape<6> -// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<1> : tensor<6xindex>} : () -> !tosa.shape<6> +// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAR1:.*]] = tosa.pad %arg0, %[[VAR0]], %[[PVAL]] func.func @test_pad(%arg0: tensor<13x21x3xf32>) -> tensor<15x23x5xf32> { %2 = "tf.Const"() {value = dense<1> : tensor<3x2xi32>} : () -> tensor<3x2xi32> @@ -682,8 +699,8 @@ func.func @test_pad(%arg0: tensor<13x21x3xf32>) -> tensor<15x23x5xf32> { // ----- // CHECK-LABEL: test_pad_v2 -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<-3.40282347E+38> : tensor<1xf32>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[1, 0, 0, 1, 1, 2]> : tensor<6xindex>} : () -> !tosa.shape<6> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<-3.40282347E+38> : tensor<1xf32>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[1, 0, 0, 1, 1, 2]> : tensor<6xindex>} : () -> !tosa.shape<6> // CHECK: %[[VAL_3:.*]] = tosa.pad %[[VAL_0]], %[[VAL_2]], %[[VAL_1]] func.func @test_pad_v2(%arg0: tensor<13x21x3xf32>) -> tensor<15x23x5xf32> { %1 = "tf.Const"() {value = dense<[[1, 0], [0, 1], [1, 2]]> : tensor<3x2xi32>} : () -> tensor<3x2xi32> @@ -695,7 +712,7 @@ func.func @test_pad_v2(%arg0: tensor<13x21x3xf32>) -> tensor<15x23x5xf32> { // ----- // CHECK-LABEL: test_expand_dims -// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {value = dense<[1, 13, 21, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {values = dense<[1, 13, 21, 3]> : tensor<4xindex>} // CHECK: %[[VAR0:.*]] = tosa.reshape %arg0, %[[SHAPE]] func.func @test_expand_dims(%arg0: tensor<13x21x3xf32>) -> tensor<1x13x21x3xf32> { %2 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor @@ -706,7 +723,7 @@ func.func @test_expand_dims(%arg0: tensor<13x21x3xf32>) -> tensor<1x13x21x3xf32> // ----- // CHECK-LABEL: test_expand_dims_negative_index -// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {value = dense<[13, 21, 1, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {values = dense<[13, 21, 1, 3]> : tensor<4xindex>} // CHECK: %[[VAR0:.*]] = tosa.reshape %arg0, %[[SHAPE]] func.func @test_expand_dims_negative_index(%arg0: tensor<13x21x3xf32>) -> tensor<13x1x21x3xf32> { %2 = "tf.Const"() {value = dense<-2> : tensor<1xi32>} : () -> tensor<1xi32> @@ -717,7 +734,7 @@ func.func @test_expand_dims_negative_index(%arg0: tensor<13x21x3xf32>) -> tensor // ----- // CHECK-LABEL: test_shape -// CHECK: %[[VAR0:.*]] = "tosa.const"() <{value = dense<[13, 21, 3]> : tensor<3xi32>}> +// CHECK: %[[VAR0:.*]] = "tosa.const"() <{values = dense<[13, 21, 3]> : tensor<3xi32>}> func.func @test_shape() -> tensor<3xi32> { %3 = "tf.Const"() {value = dense<[13, 21, 3]> : tensor<3xi32>} : () -> tensor<3xi32> func.return %3 : tensor<3xi32> @@ -726,7 +743,7 @@ func.func @test_shape() -> 
tensor<3xi32> { // ----- // CHECK-LABEL: test_rank -// CHECK: %[[VAR0:.*]] = "tosa.const"() <{value = dense<3> : tensor}> +// CHECK: %[[VAR0:.*]] = "tosa.const"() <{values = dense<3> : tensor}> func.func @test_rank() -> tensor { %3 = "tf.Const"() {value = dense<3> : tensor} : () -> tensor func.return %3 : tensor @@ -735,8 +752,8 @@ func.func @test_rank() -> tensor { // ----- // CHECK-LABEL: test_elu -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<1.000000e+00> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x1x1xf32>}> // CHECK-DAG: %[[VAR2:.*]] = tosa.exp %arg0 // CHECK-DAG: %[[VAR4:.*]] = tosa.sub %[[VAR2]], %[[VAR0]] // CHECK-DAG: %[[VAR6:.*]] = tosa.greater_equal %arg0, %[[VAR1]] @@ -749,7 +766,7 @@ func.func @test_elu(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_softmax -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[VAR0:.*]] = tosa.reduce_max %arg0 // CHECK-DAG: %[[VAR1:.*]] = tosa.sub %arg0, %[[VAR0]] // CHECK-DAG: %[[VAR2:.*]] = tosa.exp %[[VAR1]] @@ -764,7 +781,7 @@ func.func @test_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_log_softmax -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[VAR0:.*]] = tosa.exp %arg0 // CHECK-DAG: %[[VAR1:.*]] = tosa.reduce_sum %[[VAR0]] {axis = 2 : i32} // CHECK-DAG: %[[VAR2:.*]] = tosa.reciprocal %[[VAR1]] @@ -778,7 +795,8 @@ func.func @test_log_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_batch_matmul_3d -// CHECK: %[[VAR0:.*]] = tosa.matmul %arg0, %arg1 +// CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK: %[[VAR0:.*]] = tosa.matmul %arg0, %arg1, %[[ZP]], %[[ZP]] func.func @test_batch_matmul_3d(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x3x42xf32>) -> tensor<13x21x42xf32> { %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false, device = ""} : (tensor<13x21x3xf32>, tensor<13x3x42xf32>) -> tensor<13x21x42xf32> func.return %0 : tensor<13x21x42xf32> @@ -787,13 +805,14 @@ func.func @test_batch_matmul_3d(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x3x4 // ----- // CHECK-LABEL: test_batch_matmul_4d -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[5, 13, 21, 42]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[65, 3, 42]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[65, 21, 3]> : tensor<3xindex>} -// CHECK: %[[VAL_5:.*]] = tosa.reshape %arg0, %[[VAL_4]] -// CHECK: %[[VAL_6:.*]] = tosa.reshape %arg1, %[[VAL_3]] -// CHECK: %[[VAL_7:.*]] = tosa.matmul %[[VAL_5]], %[[VAL_6]] -// CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_7]], %[[VAL_2]] +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[65, 21, 3]> : tensor<3xindex>} +// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {values = dense<[65, 3, 42]> : tensor<3xindex>} +// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {values = dense<[5, 13, 21, 42]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR0:.*]] = 
tosa.reshape %arg0, %[[VAR10]] +// CHECK-DAG: %[[VAR1:.*]] = tosa.reshape %arg1, %[[VAR11]] +// CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAR2:.*]] = tosa.matmul %[[VAR0]], %[[VAR1]], %[[ZP]], %[[ZP]] +// CHECK: %[[VAR3:.*]] = tosa.reshape %[[VAR2]], %[[VAR12]] func.func @test_batch_matmul_4d(%arg0: tensor<5x13x21x3xf32>, %arg1: tensor<5x13x3x42xf32>) -> tensor<5x13x21x42xf32> { %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false, device = ""} : (tensor<5x13x21x3xf32>, tensor<5x13x3x42xf32>) -> tensor<5x13x21x42xf32> func.return %0 : tensor<5x13x21x42xf32> @@ -802,13 +821,14 @@ func.func @test_batch_matmul_4d(%arg0: tensor<5x13x21x3xf32>, %arg1: tensor<5x13 // ----- // CHECK-LABEL: test_matmul -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[14, 28]> : tensor<2xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[1, 19, 28]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[1, 14, 19]> : tensor<3xindex>} -// CHECK: %[[VAL_5:.*]] = tosa.reshape %arg0, %[[VAL_4]] : (tensor<14x19xf32>, !tosa.shape<3>) -> tensor<1x14x19xf32> -// CHECK: %[[VAL_6:.*]] = tosa.reshape %arg1, %[[VAL_3]] : (tensor<19x28xf32>, !tosa.shape<3>) -> tensor<1x19x28xf32> -// CHECK: %[[VAL_7:.*]] = tosa.matmul %[[VAL_5]], %[[VAL_6]] : (tensor<1x14x19xf32>, tensor<1x19x28xf32>) -> tensor<1x14x28xf32> -// CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_7]], %[[VAL_2]] : (tensor<1x14x28xf32>, !tosa.shape<2>) -> tensor<14x28xf32> +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, 14, 19]> : tensor<3xindex>} +// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {values = dense<[1, 19, 28]> : tensor<3xindex>} +// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {values = dense<[14, 28]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR0:.*]] = tosa.reshape %arg0, %[[VAR10]] +// CHECK-DAG: %[[VAR1:.*]] = tosa.reshape %arg1, %[[VAR11]] +// CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAR2:.*]] = tosa.matmul %[[VAR0]], %[[VAR1]], %[[ZP]], %[[ZP]] +// CHECK: %[[VAR3:.*]] = tosa.reshape %[[VAR2]], %[[VAR12]] func.func @test_matmul(%arg0: tensor<14x19xf32>, %arg1: tensor<19x28xf32>) -> tensor<14x28xf32> { %2 = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : (tensor<14x19xf32>, tensor<19x28xf32>) -> tensor<14x28xf32> func.return %2 : tensor<14x28xf32> @@ -817,7 +837,7 @@ func.func @test_matmul(%arg0: tensor<14x19xf32>, %arg1: tensor<19x28xf32>) -> te // ----- // CHECK-LABEL: test_add_scalar -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<1.000000e+00> : tensor<1x1x1xf32>}> // CHECK: %[[VAR2:.*]] = tosa.add %arg0, %[[VAR0]] func.func @test_add_scalar(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %2 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor @@ -841,10 +861,10 @@ func.func @test_add_1d(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) - // ----- // CHECK-LABEL: test_split -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[0, 14, 0]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[0, 7, 0]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[13, 7, 3]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<0> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = 
tosa.const_shape {values = dense<[0, 14, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[0, 7, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[13, 7, 3]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<0> : tensor<3xindex>} // CHECK: %[[VAL_5:.*]] = tosa.slice %arg0, %[[VAL_4]], %[[VAL_3]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<13x7x3xf32> // CHECK: %[[VAL_6:.*]] = tosa.slice %arg0, %[[VAL_2]], %[[VAL_3]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<13x7x3xf32> // CHECK: %[[VAL_7:.*]] = tosa.slice %arg0, %[[VAL_1]], %[[VAL_3]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<13x7x3xf32> @@ -878,13 +898,13 @@ func.func @test_reverse(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_space_to_batch -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[0, 0, 0, 1, 0, 0]> : tensor<6xindex>} : () -> !tosa.shape<6> -// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[0, 0, 0, 1, 0, 0]> : tensor<6xindex>} : () -> !tosa.shape<6> +// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK-DAG: %[[VAR2:.*]] = tosa.pad %arg0, %[[VAR0]], %[[PVAL]] -// CHECK-DAG: %[[VAR13:.*]] = tosa.const_shape {value = dense<[13, 11, 2, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR13:.*]] = tosa.const_shape {values = dense<[13, 11, 2, 3]> : tensor<4xindex>} // CHECK-DAG: %[[VAR3:.*]] = tosa.reshape %[[VAR2]], %[[VAR13]] // CHECK-DAG: %[[VAR4:.*]] = tosa.transpose %[[VAR3]] {perms = array} -// CHECK-DAG: %[[VAR14:.*]] = tosa.const_shape {value = dense<[26, 11, 3]> : tensor<3xindex>} +// CHECK-DAG: %[[VAR14:.*]] = tosa.const_shape {values = dense<[26, 11, 3]> : tensor<3xindex>} // CHECK: %[[VAR5:.*]] = tosa.reshape %[[VAR4]], %[[VAR14]] func.func @test_space_to_batch(%arg0: tensor<13x21x3xf32>) -> tensor<26x11x3xf32> { %2 = "tf.Const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> @@ -897,10 +917,10 @@ func.func @test_space_to_batch(%arg0: tensor<13x21x3xf32>) -> tensor<26x11x3xf32 // CHECK-LABEL: test_batch_to_space // CHECK-DAG: %[[VAR2:.*]] = tosa.transpose %arg0 {perms = array} -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[2, 2, 2, 32, 32, 1]> : tensor<6xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[2, 2, 2, 32, 32, 1]> : tensor<6xindex>} // CHECK-DAG: %[[VAR3:.*]] = tosa.reshape %[[VAR2]], %[[VAR10]] // CHECK-DAG: %[[VAR4:.*]] = tosa.transpose %[[VAR3]] {perms = array} -// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {value = dense<[2, 64, 64, 1]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {values = dense<[2, 64, 64, 1]> : tensor<4xindex>} // CHECK-DAG: %[[VAR5:.*]] = tosa.reshape %[[VAR4]], %[[VAR12]] // CHECK: return %[[VAR5]] func.func @test_batch_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<2x64x64x1xf32> { @@ -915,10 +935,10 @@ func.func @test_batch_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<2x64x64x1 // ----- // CHECK-LABEL: test_space_to_depth -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[1, 16, 2, 16, 2, 8]> : tensor<6xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, 16, 2, 16, 2, 8]> : tensor<6xindex>} // CHECK-DAG: %[[VAR1:.*]] = tosa.reshape %arg0, %[[VAR10]] // CHECK-DAG: %[[VAR2:.*]] = tosa.transpose 
%[[VAR1]] {perms = array} -// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {value = dense<[1, 16, 16, 32]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {values = dense<[1, 16, 16, 32]> : tensor<4xindex>} // CHECK: %[[VAR3:.*]] = tosa.reshape %[[VAR2]], %[[VAR12]] func.func @test_space_to_depth(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x16x16x32xf32> { %2 = "tf.SpaceToDepth"(%arg0) {block_size = 2 : i64, data_format = "NHWC"} : (tensor<1x32x32x8xf32>) -> tensor<1x16x16x32xf32> @@ -928,10 +948,10 @@ func.func @test_space_to_depth(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x16x16x3 // ----- // CHECK-LABEL: test_depth_to_space -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[1, 32, 32, 2, 2, 2]> : tensor<6xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, 32, 32, 2, 2, 2]> : tensor<6xindex>} // CHECK-DAG: %[[VAR1:.*]] = tosa.reshape %arg0, %[[VAR10]] // CHECK-DAG: %[[VAR2:.*]] = tosa.transpose %[[VAR1]] {perms = array} -// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {value = dense<[1, 64, 64, 2]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {values = dense<[1, 64, 64, 2]> : tensor<4xindex>} // CHECK: %[[VAR3:.*]] = tosa.reshape %[[VAR2]], %[[VAR12]] func.func @test_depth_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x64x64x2xf32> { %2 = "tf.DepthToSpace"(%arg0) {block_size = 2 : i64, data_format = "NHWC"} : (tensor<1x32x32x8xf32>) -> tensor<1x64x64x2xf32> @@ -960,11 +980,11 @@ func.func @test_right_shift(%arg0: tensor<4x4xi32>, %arg1: tensor<1x1xi32>) -> t // CHECK-LABEL: @test_one_hot // CHECK-SAME: %[[ARG0_0:.*]]: tensor<4x4xi32>, %[[ARG1_0:.*]]: tensor, %[[ARG2:.*]]: tensor -// CHECK-DAG: %[[SHAPE_2:.*]] = tosa.const_shape {value = dense<[4, 4, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[SHAPE_1:.*]] = tosa.const_shape {value = dense<[16, 1]> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[CST1:.*]] = tosa.const_shape {value = dense<[16, 1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[CST2:.*]] = tosa.const_shape {value = dense<[16, 2, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[SHAPE_0:.*]] = tosa.const_shape {value = dense<1> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[SHAPE_2:.*]] = tosa.const_shape {values = dense<[4, 4, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[SHAPE_1:.*]] = tosa.const_shape {values = dense<[16, 1]> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[CST1:.*]] = tosa.const_shape {values = dense<[16, 1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[CST2:.*]] = tosa.const_shape {values = dense<[16, 2, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[SHAPE_0:.*]] = tosa.const_shape {values = dense<1> : tensor<3xindex>} : () -> !tosa.shape<3> // CHECK: %[[RESHAPE_0:.*]] = tosa.reshape %[[ARG1_0]], %[[SHAPE_0]] // CHECK: %[[TILE:.*]] = tosa.tile %[[RESHAPE_0]], %[[CST1]] // CHECK: %[[RESHAPE_1:.*]] = tosa.reshape %[[ARG2]], %[[SHAPE_0]] @@ -982,12 +1002,12 @@ func.func @test_one_hot(%arg0: tensor<4x4xi32>, %arg1: tensor, %arg2: tenso // ----- // CHECK-LABEL: test_fakequant_with_min_max_args -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<-2.00003052> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<1.99996948> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{value = dense<6.10360876E-5> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{value = 
dense<16383.75> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() <{value = dense<5.000000e-01> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<-2.00003052> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<1.99996948> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{values = dense<6.10360876E-5> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{values = dense<16383.75> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() <{values = dense<5.000000e-01> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[VAR6:.*]] = tosa.minimum %arg0, %[[VAR1]] // CHECK-DAG: %[[VAR8:.*]] = tosa.maximum %[[VAR6]], %[[VAR0]] // CHECK-DAG: %[[VAR10:.*]] = tosa.sub %[[VAR8]], %[[VAR0]] @@ -1003,9 +1023,9 @@ func.func @test_fakequant_with_min_max_args(%arg0: tensor<13x21x3xf32>) -> tenso // ----- // CHECK-LABEL: test_gather -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[1, 13, 63]> : tensor<3xindex>} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<{{.*}} : tensor<1x49xi32>}> -// CHECK-DAG: %[[VAR2:.*]] = tosa.const_shape {value = dense<[7, 7, 21, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[1, 13, 63]> : tensor<3xindex>} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<{{.*}} : tensor<1x49xi32>}> +// CHECK-DAG: %[[VAR2:.*]] = tosa.const_shape {values = dense<[7, 7, 21, 3]> : tensor<4xindex>} // CHECK-DAG: %[[VAR3:.*]] = tosa.reshape %arg0, %[[VAR0]] // CHECK-DAG: %[[VAR4:.*]] = tosa.gather %[[VAR3]], %[[VAR1]] // CHECK-DAG: %[[VAR5:.*]] = tosa.reshape %[[VAR4]], %[[VAR2]] @@ -1020,9 +1040,9 @@ func.func @test_gather(%arg0: tensor<13x21x3xf32>) -> tensor<7x7x21x3xf32> { // ----- // CHECK-LABEL: test_gather_nd -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<{{\[\[}}0, 5, 3, 12, 2, 4, 3, 11, 1, 11, 10, 3, 12, 8, 5, 3, 1, 11, 3, 10, 0, 0, 8, 4, 7, 3, 12, 2, 7, 6, 11, 4, 2, 10, 11, 11, 1, 11, 1, 1, 11, 8]]> : tensor<1x42xi32>}> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[6, 7, 21, 3]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[1, 13, 63]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<{{\[\[}}0, 5, 3, 12, 2, 4, 3, 11, 1, 11, 10, 3, 12, 8, 5, 3, 1, 11, 3, 10, 0, 0, 8, 4, 7, 3, 12, 2, 7, 6, 11, 4, 2, 10, 11, 11, 1, 11, 1, 1, 11, 8]]> : tensor<1x42xi32>}> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[6, 7, 21, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[1, 13, 63]> : tensor<3xindex>} // CHECK: %[[VAL_4:.*]] = tosa.reshape %[[VAL_0]], %[[VAL_3]] // CHECK: %[[VAL_5:.*]] = tosa.gather %[[VAL_4]], %[[VAL_1]] // CHECK: %[[VAL_6:.*]] = tosa.reshape %[[VAL_5]], %[[VAL_2]] @@ -1033,13 +1053,27 @@ func.func @test_gather_nd(%arg0: tensor<13x21x3xf32>) -> tensor<6x7x21x3xf32> { func.return %1 : tensor<6x7x21x3xf32> } +// ----- + +// CHECK-LABEL: test_scatter_nd +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x224x512xf32>}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1x2xi32>}> +// CHECK-DAG: %[[VAR3:.*]] = tosa.reduce_sum %[[VAR2:.*]] {axis = 1 : i32} : (tensor<1x2xi32>) +// CHECK-DAG: %[[VAR4:.*]] = tosa.scatter 
%[[VAR1:.*]], %[[VAR3:.*]], %arg0 : (tensor<1x224x512xf32>, tensor<1x1xi32>, tensor<1x1x512xf32>) +// CHECK: return %[[VAR4]] +func.func @test_scatter_nd(%arg0: tensor<1x1x512xf32>) -> tensor<1x224x512xf32> { + %shape = "tf.Const"() {device = "", value = dense<[1, 224, 512]> : tensor<3xi32>} : () -> tensor<3xi32> + %indices = "tf.Const"() {device = "", value = dense<[[[0, 0]]]>: tensor<1x1x2xi32>} : () -> tensor<1x1x2xi32> + %1 = "tf.ScatterNd"(%indices, %arg0, %shape) {device = ""} : (tensor<1x1x2xi32>, tensor<1x1x512xf32>, tensor<3xi32>) -> tensor<1x224x512xf32> + func.return %1 : tensor<1x224x512xf32> +} // ----- // CHECK-LABEL: test_fused_batch_norm func.func @test_fused_batch_norm(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> tensor<8x8x8x8xf32> { - // CHECK-DAG: %[[CONST0:.+]] = tosa.const_shape {value = dense<[1, 1, 1, 8]> : tensor<4xindex>} - // CHECK-DAG: %[[ONE:.+]] = "tosa.const"() <{value = dense<1.000000e-03> : tensor<1xf32>}> + // CHECK-DAG: %[[CONST0:.+]] = tosa.const_shape {values = dense<[1, 1, 1, 8]> : tensor<4xindex>} + // CHECK-DAG: %[[ONE:.+]] = "tosa.const"() <{values = dense<1.000000e-03> : tensor<1xf32>}> // CHECK: %[[RES0:.+]] = tosa.reshape %arg3, %[[CONST0]] // CHECK: %[[SUB0:.+]] = tosa.sub %arg0, %[[RES0]] // CHECK: %[[ADD0:.+]] = tosa.add %arg4, %[[ONE]] @@ -1068,13 +1102,13 @@ func.func @test_fused_batch_norm_training(%arg0: tensor<8x8x8x8xf32>, %arg1: ten // ----- // CHECK-LABEL: mirrorpad_symmetric -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[0, 8]> : tensor<2xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[8, 2]> : tensor<2xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[8, 1]> : tensor<2xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[3, 0]> : tensor<2xindex>} -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[2, 10]> : tensor<2xindex>} -// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {value = dense<[1, 10]> : tensor<2xindex>} -// CHECK-DAG: %[[VAL_7:.*]] = tosa.const_shape {value = dense<0> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[0, 8]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[8, 2]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[8, 1]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[3, 0]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[2, 10]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {values = dense<[1, 10]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_7:.*]] = tosa.const_shape {values = dense<0> : tensor<2xindex>} // CHECK: %[[VAL_8:.*]] = tosa.slice %arg0, %[[VAL_7]], %[[VAL_6]] : (tensor<5x10xf32>, !tosa.shape<2>, !tosa.shape<2>) -> tensor<1x10xf32> // CHECK: %[[VAL_9:.*]] = tosa.slice %arg0, %[[VAL_4]], %[[VAL_5]] : (tensor<5x10xf32>, !tosa.shape<2>, !tosa.shape<2>) -> tensor<2x10xf32> // CHECK: %[[VAL_10:.*]] = tosa.reverse %[[VAL_9]] {axis = 0 : i32} : (tensor<2x10xf32>) -> tensor<2x10xf32> @@ -1093,12 +1127,12 @@ func.func @mirrorpad_symmetric(%arg0: tensor<5x10xf32>) -> tensor<8x13xf32> { // ----- // CHECK-LABEL: mirrorpad_reflect -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[0, 0, 1]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[14, 22, 1]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = 
tosa.const_shape {value = dense<[0, 1, 0]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[14, 1, 3]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[1, 21, 3]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {value = dense<[1, 0, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[0, 0, 1]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[14, 22, 1]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[0, 1, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[14, 1, 3]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[1, 21, 3]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {values = dense<[1, 0, 0]> : tensor<3xindex>} // CHECK: %[[VAL_7:.*]] = tosa.slice %arg0, %[[VAL_6]], %[[VAL_5]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<1x21x3xf32> // CHECK: %[[VAL_8:.*]] = tosa.concat %[[VAL_7]], %arg0 {axis = 0 : i32} : (tensor<1x21x3xf32>, tensor<13x21x3xf32>) -> tensor<14x21x3xf32> // CHECK: %[[VAL_9:.*]] = tosa.slice %[[VAL_8]], %[[VAL_3]], %[[VAL_4]] : (tensor<14x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<14x1x3xf32> @@ -1115,8 +1149,8 @@ func.func @mirrorpad_reflect(%arg0: tensor<13x21x3xf32>) -> tensor<14x22x4xf32> // ----- // CHECK-LABEL: test_broadcast_to_f32 -// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[1, 1, 13, 1]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{value = dense<-0.000000e+00> : tensor<3x3x13x7xf32>} +// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {values = dense<[1, 1, 13, 1]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<-0.000000e+00> : tensor<3x3x13x7xf32>} // CHECK: %[[VAL_1:.*]] = tosa.reshape %arg0, %[[VAL_10]] : (tensor<13x1xf32>, !tosa.shape<4>) -> tensor<1x1x13x1xf32> // CHECK: %[[VAL_2:.*]] = tosa.add %[[VAL_1]], %[[VAL_0]] : (tensor<1x1x13x1xf32>, tensor<3x3x13x7xf32>) -> tensor<3x3x13x7xf32> // CHECK: return %[[VAL_2]] : tensor<3x3x13x7xf32> @@ -1129,8 +1163,8 @@ func.func @test_broadcast_to_f32(%arg0: tensor<13x1xf32>) -> (tensor<3x3x13x7xf3 // ----- // CHECK-LABEL: test_broadcast_to_i32 -// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[1, 1, 13, 1]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{value = dense<0> : tensor<7x7x13x3xi32>} +// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {values = dense<[1, 1, 13, 1]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<0> : tensor<7x7x13x3xi32>} // CHECK: %[[VAL_1:.*]] = tosa.reshape %arg0, %[[VAL_10]] : (tensor<13x1xi32>, !tosa.shape<4>) -> tensor<1x1x13x1xi32> // CHECK: %[[VAL_2:.*]] = tosa.add %[[VAL_1]], %[[VAL_0]] : (tensor<1x1x13x1xi32>, tensor<7x7x13x3xi32>) -> tensor<7x7x13x3xi32> // CHECK: return %[[VAL_2]] : tensor<7x7x13x3xi32> @@ -1143,8 +1177,8 @@ func.func @test_broadcast_to_i32(%arg0: tensor<13x1xi32>) -> (tensor<3x3x13x3xi3 // ----- // CHECK-LABEL: test_broadcast_to_i1 -// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[1, 1, 13, 1]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{value = dense : tensor<7x7x13x7xi1>} +// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {values = dense<[1, 1, 13, 1]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{values = dense : tensor<7x7x13x7xi1>} // CHECK: 
%[[VAL_1:.*]] = tosa.reshape %arg0, %[[VAL_10]] : (tensor<13x1xi1>, !tosa.shape<4>) -> tensor<1x1x13x1xi1> // CHECK: %[[VAL_2:.*]] = tosa.logical_or %[[VAL_1]], %[[VAL_0]] : (tensor<1x1x13x1xi1>, tensor<7x7x13x7xi1>) -> tensor<7x7x13x7xi1> // CHECK: return %[[VAL_2]] : tensor<7x7x13x7xi1> @@ -1157,8 +1191,8 @@ func.func @test_broadcast_to_i1(%arg0: tensor<13x1xi1>) -> (tensor<7x7x13x7xi1>) // ----- // CHECK-LABEL: test_broadcast_to_i16 -// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[1, 1, 13, 1]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{value = dense<0> : tensor<7x7x13x3xi32>} +// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {values = dense<[1, 1, 13, 1]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<0> : tensor<7x7x13x3xi32>} // CHECK: %[[VAL_1:.*]] = tosa.reshape %arg0, %[[VAL_10]] // CHECK: %[[VAL_2:.*]] = tosa.cast %[[VAL_1]] : (tensor<1x1x13x1xi16>) -> tensor<1x1x13x1xi32> // CHECK: %[[VAL_3:.*]] = tosa.add %[[VAL_2]], %[[VAL_0]] : (tensor<1x1x13x1xi32>, tensor<7x7x13x3xi32>) -> tensor<7x7x13x3xi32> @@ -1173,7 +1207,7 @@ func.func @test_broadcast_to_i16(%arg0: tensor<13x1xi16>) -> (tensor<7x7x13x3xi1 // ----- // CHECK-LABEL: test_broadcast_to_smaller_rank -// CHECK: %[[VAL_0:.*]] = "tosa.const"() <{value = dense<[13, 7]> : tensor<2xi32>} +// CHECK: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<[13, 7]> : tensor<2xi32>} // CHECK: %[[VAL_1:.*]] = "tf.BroadcastTo"(%arg0, %[[VAL_0]]) : (tensor<2x3x13x1xi32>, tensor<2xi32>) -> tensor<13x7xi32> // CHECK: return %[[VAL_1]] : tensor<13x7xi32> func.func @test_broadcast_to_smaller_rank(%arg0: tensor<2x3x13x1xi32>) -> (tensor<13x7xi32>) { diff --git a/tensorflow/compiler/mlir/tosa/tests/tf-unequal-ranks.mlir b/tensorflow/compiler/mlir/tosa/tests/tf-unequal-ranks.mlir index ead76da89912..97ebeeac782a 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tf-unequal-ranks.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tf-unequal-ranks.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt --split-input-file --tf-to-tosa-pipeline --verify-each %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --split-input-file --tf-to-tosa-pipeline --verify-each %s | FileCheck %s + // Test tf legalization that produce TOSA ResultsBroadcastableShape operators with unequal ranks // ----- @@ -79,8 +79,9 @@ func.func @test_logical_or(%arg0: tensor<8x13x21x3xi1>, %arg1: tensor<13x21x1xi1 // ----- // CHECK-LABEL: test_floor_div +// CHECK: tosa.intdiv +// CHECK: tosa.select func.func @test_floor_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor<1x13x1x3xi32>) -> tensor<1x13x21x3xi32> { - // CHECK: tosa.int_div %2 = "tf.FloorDiv"(%arg0, %arg1) : (tensor<13x21x3xi32>, tensor<1x13x1x3xi32>) -> tensor<1x13x21x3xi32> func.return %2 : tensor<1x13x21x3xi32> } @@ -88,7 +89,7 @@ func.func @test_floor_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor<1x13x1x3xi32 // ----- // CHECK-LABEL: test_real_div -// CHECK: tosa.int_div +// CHECK: tosa.intdiv func.func @test_real_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor<1x13x1x3xi32>) -> tensor<1x13x21x3xi32> { %2 = "tf.RealDiv"(%arg0, %arg1) : (tensor<13x21x3xi32>, tensor<1x13x1x3xi32>) -> tensor<1x13x21x3xi32> func.return %2 : tensor<1x13x21x3xi32> diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-dequantize_softmax.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-dequantize_softmax.mlir index 936dbf7c69c6..28c764de62ab 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-dequantize_softmax.mlir +++ 
b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-dequantize_softmax.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt --tosa-dequantize-tfl-softmax %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --tosa-dequantize-tfl-softmax %s | FileCheck %s + // ----- diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline-filtered.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline-filtered.mlir index 3c7fa3892da1..1bc7e084fdbc 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline-filtered.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline-filtered.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt --pass-pipeline='builtin.module(func.func(tosa-legalize-tfl{disable-patterns=TFLConv2D,TFLSoftmax, enable-patterns=TFLFullyConnected,TFLTranspose}))' %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --pass-pipeline='builtin.module(func.func(tosa-legalize-tfl{disable-patterns=TFLConv2D,TFLSoftmax, enable-patterns=TFLFullyConnected,TFLTranspose}))' %s | FileCheck %s + // ----- @@ -26,15 +26,14 @@ func.func @test_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_matmul -// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<[14, 1, 1, 19]> : tensor<4xindex>} -// CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {value = dense<[28, 1, 1, 19]> : tensor<4xindex>} -// CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {value = dense<[14, 28]> : tensor<2xindex>} -// CHECK-DAG: %[[CONST3:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<28xf32>}> +// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {values = dense<[14, 1, 1, 19]> : tensor<4xindex>} +// CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {values = dense<[28, 1, 1, 19]> : tensor<4xindex>} +// CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {values = dense<[14, 28]> : tensor<2xindex>} +// CHECK-DAG: %[[CONST3:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAR1:.*]] = tosa.transpose %arg1 {perms = array} // CHECK-DAG: %[[VAR2:.*]] = tosa.reshape %arg0, %[[CONST0]] // CHECK-DAG: %[[VAR3:.*]] = tosa.reshape %[[VAR1]], %[[CONST1]] -// CHECK-DAG: %[[VAR4:.*]] = tosa.conv2d %[[VAR2]], %[[VAR3]], %[[VAR0]], %[[CONST3]], %[[CONST3]] {acc_type = f32, dilation = array, pad = array, stride = array} +// CHECK-DAG: %[[VAR4:.*]] = tosa.conv2d %[[VAR2]], %[[VAR3]], %[[CONST3]], %[[CONST3]], %[[CONST3]] {acc_type = f32, dilation = array, pad = array, stride = array} // CHECK: %[[VAR5:.*]] = tosa.reshape %[[VAR4]], %[[CONST2]] func.func @test_matmul(%arg0: tensor<14x19xf32>, %arg1: tensor<19x28xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[1, 0]> : tensor<2xi32> diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir index 95a468e8da6b..c217547b4a78 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir @@ -1,7 +1,7 @@ -// RUN: tf-opt --split-input-file --tfl-to-tosa-pipeline --verify-each %s | FileCheck %s -// REQUIRES: tf_tosa -// RUN: tf-opt --split-input-file --tf-tfl-to-tosa-pipeline --verify-each %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --split-input-file --tfl-to-tosa-pipeline --verify-each %s | FileCheck %s + +// RUN: tf-tosa-opt --split-input-file --tf-tfl-to-tosa-pipeline --verify-each %s | FileCheck %s + // Operations for testing 
tfl-to-tosa-pipeline @@ -13,8 +13,8 @@ // ----- // CHECK-LABEL: test_conv2d -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>}> -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<16xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAL_4:.*]] = tosa.conv2d %arg0, %arg1, %[[VAL_2]], %[[VAL_3]], %[[VAL_3]] {acc_type = f32, dilation = array, pad = array, stride = array} func.func @test_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>) -> tensor<*xf32> { %cst = arith.constant dense<0.000000e+00> : tensor<16xf32> @@ -36,7 +36,7 @@ func.func @test_conv2d_dynamic(%arg0: tensor, %arg1: tensor<16x1x // ----- // CHECK-LABEL: test_conv2d_bias -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAL_4:.*]] = tosa.conv2d %arg0, %arg1, %arg2, %[[VAL_3]], %[[VAL_3]] {acc_type = f32, dilation = array, pad = array, stride = array} // CHECK-SAME: tensor<1x32x32x16xf32> func.func @test_conv2d_bias(%arg0: tensor<1x32x32x8xf32>, %cst: tensor<16x2x2x8xf32>, %cst_0: tensor<16xf32>) -> tensor<*xf32> { @@ -47,9 +47,9 @@ func.func @test_conv2d_bias(%arg0: tensor<1x32x32x8xf32>, %cst: tensor<16x2x2x8x // ----- // CHECK-LABEL: test_conv2d_slicing -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[2, 31, 30, 8]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<0> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[2, 31, 30, 8]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAL_6:.*]] = tosa.slice %arg0, %[[VAL_4]], %[[VAL_3]] // CHECK: %[[VAL_7:.*]] = tosa.conv2d %[[VAL_6]], %arg1, %arg2, %[[VAL_5]], %[[VAL_5]] {acc_type = f32, dilation = array, pad = array, stride = array} // CHECK-SAME: tensor<2x15x10x16xf32> @@ -61,9 +61,8 @@ func.func @test_conv2d_slicing(%arg0: tensor<2x32x32x8xf32>, %arg1: tensor<16x3x // ----- // CHECK-LABEL: test_transpose_conv2d -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>}> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> -// CHECK: %[[VAR2:.*]] = tosa.transpose_conv2d %arg0, %arg1, %[[VAR0]], %[[VAR1]], %[[VAR1]] {acc_type = f32, out_pad = array, stride = array} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK: %[[VAR2:.*]] = tosa.transpose_conv2d %arg0, %arg1, %[[VAR1]], %[[VAR1]], %[[VAR1]] {acc_type = f32, out_pad = array, stride = array} func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>, %cst_0: tensor<16x1x1x8xf32>) -> tensor<1x32x32x16xf32> { %cst = arith.constant dense<[1, 32, 32, 16]> : tensor<4xi32> %cst_1 = "tfl.no_value"() {value = unit} : () -> none @@ -74,9 +73,8 @@ func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>, %cst_0: tensor<16 // ----- // CHECK-LABEL: test_transpose_conv2d_relu -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>}> -// 
CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> -// CHECK: %[[VAR2:.*]] = tosa.transpose_conv2d %arg0, %arg1, %[[VAR0]], %[[VAR1]], %[[VAR1]] {acc_type = f32, out_pad = array, stride = array} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK: %[[VAR2:.*]] = tosa.transpose_conv2d %arg0, %arg1, %[[VAR1]], %[[VAR1]], %[[VAR1]] {acc_type = f32, out_pad = array, stride = array} // CHECK: %[[VAR3:.*]] = tosa.clamp %[[VAR2]] {max_val = 3.40282347E+38 : f32, min_val = 0.000000e+00 : f32} func.func @test_transpose_conv2d_relu(%arg0: tensor<1x32x32x8xf32>, %cst_0: tensor<16x1x1x8xf32>) -> tensor<1x32x32x16xf32> { %cst = arith.constant dense<[1, 32, 32, 16]> : tensor<4xi32> @@ -87,10 +85,25 @@ func.func @test_transpose_conv2d_relu(%arg0: tensor<1x32x32x8xf32>, %cst_0: tens // ----- +// CHECK-LABEL: test_transpose_conv2d_outpad +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK: %[[VAR2:.*]] = tosa.transpose_conv2d %arg0, %arg1, %[[VAR0]], %[[VAR0]], %[[VAR0]] {acc_type = f32, out_pad = array, stride = array} +func.func @test_transpose_conv2d_outpad(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>) -> tensor<1x33x33x16xf32> { + %cst = arith.constant dense<[1, 33, 33, 16]> : tensor<4xi32> + %cst_1 = "tfl.no_value"() {value = unit} : () -> none + %0 = "tfl.transpose_conv"(%cst, %arg1, %arg0, %cst_1) + {padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32, + fused_activation_function = "NONE"} + : (tensor<4xi32>, tensor<16x1x1x8xf32>, tensor<1x32x32x8xf32>, none) -> tensor<1x33x33x16xf32> + func.return %0 : tensor<1x33x33x16xf32> +} + +// ----- + // CHECK-LABEL: test_conv2d_qi8 -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<16x2x2x8xi8>}> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0> : tensor<16xi32>}> -// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<{{.*}}> : tensor<16x2x2x8xi8>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<0> : tensor<16xi32>}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[VAR3:.*]] = tosa.conv2d %arg0, %[[VAR0]], %[[VAR1]], %[[VAR2]], %[[VAR2]] {acc_type = i32, dilation = array, pad = array, stride = array} // CHECK: %[[VAR4:.*]] = tosa.rescale %[[VAR3]] func.func @test_conv2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform>) -> tensor<1x32x32x16x!quant.uniform> { @@ -103,9 +116,9 @@ func.func @test_conv2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform : tensor<16x2x2x8xi8>}> -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense<1> : tensor<16xi8>}> -// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<{{.*}}> : tensor<16x2x2x8xi8>}> +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<1> : tensor<16xi32>}> +// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK: %[[VAL_6:.*]] = tosa.conv2d %arg0, %[[VAL_3]], %[[VAL_4]], %[[VAL_5]], %[[VAL_5]] {acc_type = i32, dilation = array, pad = array, stride = array} // CHECK: %[[VAL_7:.*]] = tosa.rescale %[[VAL_6]] func.func @test_conv2d_qi8_2(%arg0: tensor<1x32x32x8x!quant.uniform>) -> tensor<1x32x32x16x!quant.uniform> { @@ -118,10 +131,10 @@ func.func @test_conv2d_qi8_2(%arg0: tensor<1x32x32x8x!quant.uniform : tensor<16xi48>}> -// CHECK-DAG: 
%[[VAR1:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<16x1x1x8xi8>}> -// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi16>}> -// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<0> : tensor<16xi48>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<{{.*}}> : tensor<16x1x1x8xi8>}> +// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi16>}> +// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK: %[[VAR5:.*]] = tosa.conv2d %arg0, %[[VAR1]], %[[VAR0]], %[[VAR3]], %[[VAR4]] {acc_type = i48, dilation = array, pad = array, stride = array} // CHECK: %[[VAR6:.*]] = tosa.rescale %[[VAR5]] func.func @test_conv2d_qi16(%arg0: tensor<1x32x32x8x!quant.uniform>) -> tensor<1x32x32x16x!quant.uniform> { @@ -134,12 +147,14 @@ func.func @test_conv2d_qi16(%arg0: tensor<1x32x32x8x!quant.uniform // ----- // CHECK-LABEL: @test_depthwise_conv2d_bias_qi8 -// CHECK-SAME: %[[ARG0:.*]]: tensor<1x32x32x8x!quant.uniform> -// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<[2, 2, 8, 2]> : tensor<4xindex>} -// CHECK-DAG: %[[CONST1:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<16xi32>}> -// CHECK-DAG: %[[CONST2:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<1x2x2x16xi8>}> -// CHECK-DAG: %[[INPUT_ZP:.*]] = "tosa.const"() <{value = dense<-1> : tensor<1xi8>}> -// CHECK-DAG: %[[WEIGHT_ZP:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-SAME: %[[ARG0:.*]]: tensor<1x32x32x8x!quant.uniform> +// CHECK-DAG: %[[shift:.*]] = "tosa.const"() <{values = dense<[36, 36, 36, 36, 32, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36]> : tensor<16xi8>}> : () -> tensor<16xi8> +// CHECK-DAG: %[[multiplier:.*]] = "tosa.const"() <{values = dense<[1373724854, 1373724854, 1373724854, 1373724854, 1803013871, 1373724854, 1373724854, 1373724854, 1373724854, 1373724854, 1373724854, 1373724854, 1373724854, 1373724854, 1373724854, 1373724854]> : tensor<16xi32>}> : () -> tensor<16xi32> +// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {values = dense<[2, 2, 8, 2]> : tensor<4xindex>} +// CHECK-DAG: %[[CONST1:.*]] = "tosa.const"() <{values = dense<{{.*}}> : tensor<16xi32>}> +// CHECK-DAG: %[[CONST2:.*]] = "tosa.const"() <{values = dense<{{.*}}> : tensor<1x2x2x16xi8>}> +// CHECK-DAG: %[[INPUT_ZP:.*]] = "tosa.const"() <{values = dense<-1> : tensor<1xi8>}> +// CHECK-DAG: %[[WEIGHT_ZP:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[RESHAPE:.*]] = tosa.reshape %[[CONST2]], %[[CONST0]] // CHECK-DAG: %[[DEPTHWISE:.*]] = tosa.depthwise_conv2d %[[ARG0]], %[[RESHAPE]], %[[CONST1]], %[[INPUT_ZP]], %[[WEIGHT_ZP]] {acc_type = i32, dilation = array, pad = array, stride = array} // CHECK-DAG: %[[RESCALE:.*]] = tosa.rescale %[[DEPTHWISE]] @@ -154,14 +169,14 @@ func.func @test_depthwise_conv2d_bias_qi8(%arg0: tensor<1x32x32x8x!quant.uniform // ----- // CHECK-LABEL: @test_conv2d_grouped_convolution -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[1, 4, 1, 64]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<0> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[64, 1, 1, 64]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {value = dense<64> : tensor<1xindex>} -// CHECK-DAG: %[[VAL_7:.*]] = tosa.const_shape {value = dense<0> : tensor<1xindex>} -// CHECK-DAG: %[[VAL_8:.*]] = tosa.const_shape {value = dense<[0, 0, 0, 
64]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_9:.*]] = tosa.const_shape {value = dense<[64, 0, 0, 0]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_10:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[1, 4, 1, 64]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[64, 1, 1, 64]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {values = dense<64> : tensor<1xindex>} +// CHECK-DAG: %[[VAL_7:.*]] = tosa.const_shape {values = dense<0> : tensor<1xindex>} +// CHECK-DAG: %[[VAL_8:.*]] = tosa.const_shape {values = dense<[0, 0, 0, 64]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_9:.*]] = tosa.const_shape {values = dense<[64, 0, 0, 0]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_10:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK-DAG: %[[INPUT_SLICE_1:.*]] = tosa.slice %arg0, %[[VAL_4]], %[[VAL_3]] // CHECK-DAG: %[[FILTER_SLICE_1:.*]] = tosa.slice %arg1, %[[VAL_4]], %[[VAL_5]] // CHECK-DAG: %[[BIAS_SLICE_1:.*]] = tosa.slice %arg2, %[[VAL_7]], %[[VAL_6]] @@ -180,20 +195,20 @@ func.func @test_conv2d_grouped_convolution(%input: tensor<1x4x1x128xf32>, %weigh // ----- // CHECK-LABEL: @test_conv2d_grouped_strided_convolution -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[1, 3, 1, 16]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<0> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[128, 3, 1, 16]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {value = dense<128> : tensor<1xindex>} -// CHECK-DAG: %[[VAL_7:.*]] = tosa.const_shape {value = dense<0> : tensor<1xindex>} -// CHECK-DAG: %[[VAL_8:.*]] = tosa.const_shape {value = dense<[0, 0, 0, 16]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_9:.*]] = tosa.const_shape {value = dense<[128, 0, 0, 0]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[0, 0, 0, 32]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_11:.*]] = tosa.const_shape {value = dense<[256, 0, 0, 0]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_12:.*]] = tosa.const_shape {value = dense<256> : tensor<1xindex>} -// CHECK-DAG: %[[VAL_13:.*]] = tosa.const_shape {value = dense<[0, 0, 0, 48]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_14:.*]] = tosa.const_shape {value = dense<[384, 0, 0, 0]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_15:.*]] = tosa.const_shape {value = dense<384> : tensor<1xindex>} -// CHECK-DAG: %[[VAL_16:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[1, 3, 1, 16]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[128, 3, 1, 16]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {values = dense<128> : tensor<1xindex>} +// CHECK-DAG: %[[VAL_7:.*]] = tosa.const_shape {values = dense<0> : tensor<1xindex>} +// CHECK-DAG: %[[VAL_8:.*]] = tosa.const_shape {values = dense<[0, 0, 0, 16]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_9:.*]] = tosa.const_shape {values = dense<[128, 0, 0, 0]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {values = dense<[0, 0, 0, 32]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_11:.*]] = tosa.const_shape {values = dense<[256, 0, 0, 0]> : tensor<4xindex>} +// CHECK-DAG: 
%[[VAL_12:.*]] = tosa.const_shape {values = dense<256> : tensor<1xindex>} +// CHECK-DAG: %[[VAL_13:.*]] = tosa.const_shape {values = dense<[0, 0, 0, 48]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_14:.*]] = tosa.const_shape {values = dense<[384, 0, 0, 0]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_15:.*]] = tosa.const_shape {values = dense<384> : tensor<1xindex>} +// CHECK-DAG: %[[VAL_16:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK-DAG: %[[INPUT_SLICE_1:.*]] = tosa.slice %arg0, %[[VAL_4]], %[[VAL_3]] // CHECK-DAG: %[[FILTER_SLICE_1:.*]] = tosa.slice %arg1, %[[VAL_4]], %[[VAL_5]] // CHECK-DAG: %[[BIAS_SLICE_1:.*]] = tosa.slice %arg2, %[[VAL_7]], %[[VAL_6]] @@ -218,29 +233,31 @@ func.func @test_conv2d_grouped_strided_convolution(%input: tensor<1x3x1x64xf32>, } // ----- - // CHECK-LABEL: test_conv2d_q_grouped_convolution // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x4x1x16x!quant.uniform> -// CHECK: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[8, 0, 0, 0]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[0, 0, 0, 8]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK: %[[VAL_6:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> -// CHECK: %[[VAL_7:.*]] = tosa.const_shape {value = dense<8> : tensor<1xindex>} : () -> !tosa.shape<1> -// CHECK: %[[VAL_8:.*]] = tosa.const_shape {value = dense<0> : tensor<1xindex>} : () -> !tosa.shape<1> -// CHECK: %[[VAL_9:.*]] = tosa.const_shape {value = dense<[8, 1, 1, 8]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK: %[[VAL_10:.*]] = "tosa.const"() <{value = dense<42> : tensor<16x1x1x8xi8>}> : () -> tensor<16x1x1x8x!quant.uniform:f32:0, {1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01}>> -// CHECK: %[[VAL_11:.*]] = "tosa.const"() <{value = dense<0> : tensor<16xi32>}> : () -> tensor<16x!quant.uniform> -// CHECK: %[[VAL_12:.*]] = tosa.const_shape {value = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK: %[[VAL_13:.*]] = tosa.const_shape {value = dense<[1, 4, 1, 8]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[8, 0, 0, 0]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[0, 0, 0, 8]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<36> : tensor<8xi8>}> : () -> tensor<8xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<1374257539> : tensor<8xi32>}> : () -> tensor<8xi32> +// CHECK: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_7:.*]] = tosa.const_shape {values = dense<8> : tensor<1xindex>} : () -> !tosa.shape<1> +// CHECK: %[[VAL_8:.*]] = tosa.const_shape {values = dense<0> : tensor<1xindex>} : () -> !tosa.shape<1> +// CHECK: %[[VAL_9:.*]] = tosa.const_shape {values = dense<[8, 1, 1, 8]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_10:.*]] = "tosa.const"() <{values = dense<42> : tensor<16x1x1x8xi8>}> : () -> tensor<16x1x1x8x!quant.uniform:f32:0, 
{1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01}>> +// CHECK: %[[VAL_11:.*]] = "tosa.const"() <{values = dense<0> : tensor<16xi32>}> : () -> tensor<16x!quant.uniform> +// CHECK: %[[VAL_12:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_13:.*]] = tosa.const_shape {values = dense<[1, 4, 1, 8]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK: %[[VAL_14:.*]] = tosa.slice %[[VAL_0]], %[[VAL_12]], %[[VAL_13]] : (tensor<1x4x1x16x!quant.uniform>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<1x4x1x8x!quant.uniform> // CHECK: %[[VAL_15:.*]] = tosa.slice %[[VAL_10]], %[[VAL_12]], %[[VAL_9]] : (tensor<16x1x1x8x!quant.uniform:f32:0, {1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01}>>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<8x1x1x8x!quant.uniform:f32:0, {1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01}>> // CHECK: %[[VAL_16:.*]] = tosa.slice %[[VAL_11]], %[[VAL_8]], %[[VAL_7]] : (tensor<16x!quant.uniform>, !tosa.shape<1>, !tosa.shape<1>) -> tensor<8x!quant.uniform> // CHECK: %[[VAL_17:.*]] = tosa.conv2d %[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[VAL_6]], %[[VAL_6]] {acc_type = i32, dilation = array, pad = array, stride = array} : (tensor<1x4x1x8x!quant.uniform>, tensor<8x1x1x8x!quant.uniform:f32:0, {1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01}>>, tensor<8x!quant.uniform>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x1x8xi32> -// CHECK: %[[VAL_18:.*]] = tosa.rescale %[[VAL_17]] +// CHECK: %[[VAL_18:.*]] = tosa.rescale %[[VAL_17]], %[[VAL_4]], %[[VAL_3]], %[[VAL_5]], %[[VAL_6]] {input_unsigned = false, output_unsigned = false, per_channel = true, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x4x1x8xi32>, tensor<8xi32>, tensor<8xi8>, tensor<1xi32>, tensor<1xi8>) -> tensor<1x4x1x8x!quant.uniform> // CHECK: %[[VAL_19:.*]] = tosa.slice %[[VAL_0]], %[[VAL_2]], %[[VAL_13]] : (tensor<1x4x1x16x!quant.uniform>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<1x4x1x8x!quant.uniform> // CHECK: %[[VAL_20:.*]] = tosa.slice %[[VAL_10]], %[[VAL_1]], %[[VAL_9]] : (tensor<16x1x1x8x!quant.uniform:f32:0, {1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01}>>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<8x1x1x8x!quant.uniform:f32:0, {1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01}>> // CHECK: %[[VAL_21:.*]] = tosa.slice %[[VAL_11]], %[[VAL_7]], %[[VAL_7]] : (tensor<16x!quant.uniform>, !tosa.shape<1>, !tosa.shape<1>) -> tensor<8x!quant.uniform> // CHECK: %[[VAL_22:.*]] = tosa.conv2d %[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_6]], %[[VAL_6]] {acc_type = i32, dilation = array, pad = array, stride = array} : (tensor<1x4x1x8x!quant.uniform>, tensor<8x1x1x8x!quant.uniform:f32:0, {1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01,1.000000e-01}>>, tensor<8x!quant.uniform>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x1x8xi32> -// CHECK: %[[VAL_23:.*]] = tosa.rescale %[[VAL_22]] +// CHECK: 
%[[VAL_23:.*]] = tosa.rescale %[[VAL_22]], %[[VAL_4]], %[[VAL_3]], %[[VAL_5]], %[[VAL_6]] {input_unsigned = false, output_unsigned = false, per_channel = true, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x4x1x8xi32>, tensor<8xi32>, tensor<8xi8>, tensor<1xi32>, tensor<1xi8>) -> tensor<1x4x1x8x!quant.uniform> // CHECK: %[[VAL_24:.*]] = tosa.concat %[[VAL_18]], %[[VAL_23]] {axis = 3 : i32} : (tensor<1x4x1x8x!quant.uniform>, tensor<1x4x1x8x!quant.uniform>) -> tensor<1x4x1x16x!quant.uniform> func.func @test_conv2d_q_grouped_convolution(%input: tensor<1x4x1x16x!quant.uniform>) -> tensor<1x4x1x16x!quant.uniform> { %0 = "tfl.pseudo_qconst"() {qtype = tensor<16x1x1x8x!quant.uniform:f32:0, {0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1}>>, value = dense<42> : tensor<16x1x1x8xi8>} : () -> tensor<16x1x1x8x!quant.uniform:f32:0, {0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1} >> @@ -262,10 +279,10 @@ func.func @test_depthwise_conv2d_bias_inferred(%arg0: tensor, %ar // ----- // CHECK-LABEL: test_depthwise_conv2d_slicing -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[3, 3, 8, 2]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[1, 31, 31, 8]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<0> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[3, 3, 8, 2]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[1, 31, 31, 8]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAL_7:.*]] = tosa.reshape %arg1, %[[VAL_3]] // CHECK: %[[VAL_8:.*]] = tosa.slice %arg0, %[[VAL_5]], %[[VAL_4]] // CHECK: %[[VAL_9:.*]] = tosa.depthwise_conv2d %[[VAL_8]], %[[VAL_7]], %arg2, %[[VAL_6]], %[[VAL_6]] {acc_type = f32, dilation = array, pad = array, stride = array} @@ -280,10 +297,9 @@ func.func @test_depthwise_conv2d_slicing(%arg0: tensor<1x32x32x8xf32>, %arg1: te // CHECK-LABEL: test_conv3d // CHECK-SAME: %[[VAL_0:.*]]: tensor<2x2x7x7x2xf32> // CHECK-SAME: %[[VAL_1:.*]]: tensor<2x3x3x2x4xf32> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<4xf32>}> -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAL_5:.*]] = tosa.transpose %[[VAL_1]] {perms = array} -// CHECK: %[[VAL_6:.*]] = tosa.conv3d %[[VAL_0]], %[[VAL_5]], %[[VAL_2]], %[[VAL_4]], %[[VAL_4]] {acc_type = f32, dilation = array, pad = array, stride = array} +// CHECK: %[[VAL_6:.*]] = tosa.conv3d %[[VAL_0]], %[[VAL_5]], %[[VAL_4]], %[[VAL_4]], %[[VAL_4]] {acc_type = f32, dilation = array, pad = array, stride = array} func.func @test_conv3d(%arg0: tensor<2x2x7x7x2xf32>, %arg1: tensor<2x3x3x2x4xf32>) -> tensor<2x2x7x7x4xf32> { %cst = "tfl.no_value"() {value} : () -> none %0 = "tfl.conv_3d"(%arg0, %arg1, %cst) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<2x2x7x7x2xf32>, tensor<2x3x3x2x4xf32>, none) -> tensor<2x2x7x7x4xf32> @@ -295,10 +311,9 @@ func.func @test_conv3d(%arg0: 
tensor<2x2x7x7x2xf32>, %arg1: tensor<2x3x3x2x4xf32 // CHECK-LABEL: test_conv3d_dynamic // CHECK-SAME: %[[VAL_0:.*]]: tensor // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x1x1x8x16xf32> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<16xf32>}> -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAL_5:.*]] = tosa.transpose %[[VAL_1]] {perms = array} -// CHECK: %[[VAL_6:.*]] = tosa.conv3d %[[VAL_0]], %[[VAL_5]], %[[VAL_2]], %[[VAL_4]], %[[VAL_4]] {acc_type = f32, dilation = array, pad = array, stride = array} +// CHECK: %[[VAL_6:.*]] = tosa.conv3d %[[VAL_0]], %[[VAL_5]], %[[VAL_4]], %[[VAL_4]], %[[VAL_4]] {acc_type = f32, dilation = array, pad = array, stride = array} func.func @test_conv3d_dynamic(%arg0: tensor, %arg1: tensor<3x1x1x8x16xf32>) -> tensor<*xf32> { %cst = "tfl.no_value"() {value} : () -> none %0 = "tfl.conv_3d"(%arg0, %arg1, %cst) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} : (tensor, tensor<3x1x1x8x16xf32>, none) -> tensor<*xf32> @@ -311,7 +326,7 @@ func.func @test_conv3d_dynamic(%arg0: tensor, %arg1: tensor<3x // CHECK-SAME: %[[VAL_0:.*]]: tensor<10x3x64x64x12xf32> // CHECK-SAME: %[[VAL_1:.*]]: tensor<16x2x2x12x8xf32> // CHECK-SAME: %[[VAL_2:.*]]: tensor<8xf32> -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAL_5:.*]] = tosa.transpose %[[VAL_1]] {perms = array} // CHECK: %[[VAL_6:.*]] = tosa.conv3d %[[VAL_0]], %[[VAL_5]], %[[VAL_2]], %[[VAL_4]], %[[VAL_4]] {acc_type = f32, dilation = array, pad = array, stride = array} func.func @test_conv3d_bias(%arg0: tensor<10x3x64x64x12xf32>, %arg1: tensor<16x2x2x12x8xf32>, %cst: tensor<8xf32>) -> tensor<10x3x64x64x8xf32> { @@ -322,9 +337,9 @@ func.func @test_conv3d_bias(%arg0: tensor<10x3x64x64x12xf32>, %arg1: tensor<16x2 // ----- // CHECK-LABEL: test_conv3d_slicing -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[1, 31, 31, 31, 8]> : tensor<5xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<0> : tensor<5xindex>} -// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[1, 31, 31, 31, 8]> : tensor<5xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<0> : tensor<5xindex>} +// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAL_7:.*]] = tosa.slice %arg0, %[[VAL_4]], %[[VAL_3]] // CHECK: %[[VAL_8:.*]] = tosa.transpose %[[VAL_1]] {perms = array} // CHECK: %[[VAL_9:.*]] = tosa.conv3d %[[VAL_7]], %[[VAL_8]], %arg2, %[[VAL_6]], %[[VAL_6]] {acc_type = f32, dilation = array, pad = array, stride = array} @@ -338,16 +353,15 @@ func.func @test_conv3d_slicing(%arg0: tensor<1x32x32x32x8xf32>, %arg1: tensor<3x // CHECK-LABEL: test_conv3d_qi8( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x4x8x21x17x!quant.uniform> // CHECK-SAME: %[[VAL_1:.*]]: tensor<2x3x3x17x34xf32>) -> tensor<1x4x8x11x34x!quant.uniform> -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<0.0156862643> : tensor<1x1x1x1x1xf32>} -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = 
dense<1.11982894> : tensor<1x1x1x1x1xf32>} -// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{value = dense<-4> : tensor<1x1x1x1x1xi32>} -// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<34xf32>} -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> -// CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<0.0156862643> : tensor<1x1x1x1x1xf32>} +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<1.11982894> : tensor<1x1x1x1x1xf32>} +// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<-4> : tensor<1x1x1x1x1xi32>} +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[BIAS_ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAL_8:.*]] = tosa.cast %[[VAL_0]] // CHECK: %[[VAL_10:.*]] = tosa.mul %[[VAL_8]], %[[VAL_3]], %[[SHIFT]] // CHECK: %[[VAL_11:.*]] = tosa.transpose %[[VAL_1]] {perms = array} -// CHECK: %[[VAL_12:.*]] = tosa.conv3d %[[VAL_10]], %[[VAL_11]], %[[VAL_6]], %[[ZP]], %[[ZP]] {acc_type = f32, dilation = array, pad = array, stride = array} +// CHECK: %[[VAL_12:.*]] = tosa.conv3d %[[VAL_10]], %[[VAL_11]], %[[BIAS_ZP]], %[[BIAS_ZP]], %[[BIAS_ZP]] {acc_type = f32, dilation = array, pad = array, stride = array} // CHECK: %[[VAL_13:.*]] = tosa.mul %[[VAL_12]], %[[VAL_4]], %[[SHIFT]] // CHECK: %[[VAL_14:.*]] = tosa.cast %[[VAL_13]] // CHECK: %[[VAL_15:.*]] = tosa.add %[[VAL_14]], %[[VAL_5]] @@ -363,6 +377,17 @@ func.func @test_conv3d_qi8(%arg0: tensor<1x4x8x21x17x!quant.uniform : tensor<16xi48>}> : () -> tensor<16xi48> +// CHECK: tosa.conv3d {{.+}}, %[[BIAS]], %{{.+}} {acc_type = i48, {{.+}}} : {{.+}} -> tensor<1x15x15x15x16xi48> +func.func @test_conv3d_qi16(%input: tensor<1x32x32x32x8x!quant.uniform>, %filter: tensor<3x3x3x8x16x!quant.uniform>) -> tensor<1x15x15x15x16x!quant.uniform> { + %bias = "tfl.pseudo_qconst"() {qtype = tensor<16x!quant.uniform>, value = dense<123> : tensor<16xi16>} : () -> tensor<16x!quant.uniform> + %0 = "tfl.conv_3d"(%input, %filter, %bias) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_d = 2 : i32, stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x32x32x32x8x!quant.uniform>, tensor<3x3x3x8x16x!quant.uniform>, tensor<16x!quant.uniform>) -> tensor<1x15x15x15x16x!quant.uniform> + func.return %0 : tensor<1x15x15x15x16x!quant.uniform> +} + +// ----- + // CHECK-LABEL: test_add // CHECK: %[[VAR0:.*]] = tosa.add %arg0, %arg1 func.func @test_add(%arg0: tensor<13x21x1xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { @@ -400,7 +425,7 @@ func.func @test_sub_unranked(%arg0: tensor<1x21x3xf32>, %arg1: tensor<1x1x1xf32> // ----- // CHECK-LABEL: test_mul -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK: %[[VAR0:.*]] = tosa.mul %arg0, %arg1, %[[SHIFT]] func.func @test_mul(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> tensor<13x21x3xf32> { %0 = "tfl.mul"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x3xf32>, tensor<13x1x3xf32>) -> tensor<13x21x3xf32> @@ -410,7 +435,7 @@ func.func @test_mul(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> te // ----- // CHECK-LABEL: test_mul_unranked -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : 
tensor<1xi8>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK: %[[VAR0:.*]] = tosa.mul %arg0, %arg1, %[[SHIFT]] func.func @test_mul_unranked(%arg0: tensor<13x21x3xf32>, %arg1: tensor<1x1x1xf32>) -> tensor<*xf32> { %0 = "tfl.mul"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x3xf32>, tensor<1x1x1xf32>) -> tensor<*xf32> @@ -421,9 +446,31 @@ func.func @test_mul_unranked(%arg0: tensor<13x21x3xf32>, %arg1: tensor<1x1x1xf32 // CHECK-LABEL: test_exp // CHECK: %[[VAR0:.*]] = tosa.exp %arg0 -func.func @test_exp(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { - %0 = "tfl.exp"(%arg0) : (tensor<13x21x3xf32>) -> tensor<*xf32> - func.return %0 : tensor<*xf32> +func.func @test_exp(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = "tfl.exp"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + func.return %0 : tensor<13x21x3xf32> +} + +// ----- + +// CHECK-LABEL: test_exp_qi8 +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<{{.+}}> : tensor<256xi8>}> +// CHECK: %[[VAL_2:.*]] = tosa.table %[[VAL_0]], %[[VAL_1]] +func.func @test_exp_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> (tensor<13x21x3x!quant.uniform>) { + %0 = "tfl.exp"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> + func.return %0 : tensor<13x21x3x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: test_exp_qi16 +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<{{.+}}> : tensor<513xi16>}> +// CHECK: %[[VAL_2:.*]] = tosa.table %[[VAL_0]], %[[VAL_1]] +func.func @test_exp_qi16(%arg0: tensor<13x21x3x!quant.uniform>) -> (tensor<13x21x3x!quant.uniform>) { + %0 = "tfl.exp"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> + func.return %0 : tensor<13x21x3x!quant.uniform> } // ----- @@ -440,7 +487,7 @@ func.func @test_rcp(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // CHECK-LABEL: test_div // CHECK-DAG: %[[RESHAPE:.*]] = tosa.reshape %arg1 -// CHECK: %[[VAR0:.*]] = tosa.int_div %arg0, %[[RESHAPE]] +// CHECK: %[[VAR0:.*]] = tosa.intdiv %arg0, %[[RESHAPE]] func.func @test_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor) -> tensor<*xi32> { %0 = "tfl.div"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x3xi32>, tensor) -> tensor<*xi32> func.return %0 : tensor<*xi32> @@ -448,12 +495,30 @@ func.func @test_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor) -> tensor<*x // ----- -// CHECK-LABEL: test_floor_div -// CHECK-DAG: %[[RESHAPE:.*]] = tosa.reshape %arg1 -// CHECK: %[[VAR0:.*]] = tosa.int_div %arg0, %[[RESHAPE]] -func.func @test_floor_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor) -> tensor<*xi32> { - %0 = "tfl.floor_div"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x3xi32>, tensor) -> tensor<*xi32> - func.return %0 : tensor<*xi32> +// CHECK-LABEL: func.func @test_floor_div( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor<13x21x3xi32> { +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<0> : tensor<1x1x1xi32>}> : () -> tensor<1x1x1xi32> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<1> : tensor<1x1x1xi32>}> : () -> tensor<1x1x1xi32> +// CHECK: %[[VAL_5:.*]] = tosa.const_shape {values = dense<1> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK: %[[VAL_6:.*]] = tosa.reshape 
%[[VAL_1]], %[[VAL_5]] : (tensor, !tosa.shape<3>) -> tensor<1x1x1xi32> +// CHECK: %[[VAL_7:.*]] = tosa.intdiv %[[VAL_0]], %[[VAL_6]] : (tensor<13x21x3xi32>, tensor<1x1x1xi32>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_1]], %[[VAL_5]] : (tensor, !tosa.shape<3>) -> tensor<1x1x1xi32> +// CHECK: %[[VAL_9:.*]] = tosa.mul %[[VAL_0]], %[[VAL_8]], %[[VAL_2]] : (tensor<13x21x3xi32>, tensor<1x1x1xi32>, tensor<1xi8>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_10:.*]] = tosa.reshape %[[VAL_1]], %[[VAL_5]] : (tensor, !tosa.shape<3>) -> tensor<1x1x1xi32> +// CHECK: %[[VAL_11:.*]] = tosa.mul %[[VAL_10]], %[[VAL_7]], %[[VAL_2]] : (tensor<1x1x1xi32>, tensor<13x21x3xi32>, tensor<1xi8>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_12:.*]] = tosa.equal %[[VAL_0]], %[[VAL_11]] : (tensor<13x21x3xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi1> +// CHECK: %[[VAL_13:.*]] = tosa.logical_not %[[VAL_12]] : (tensor<13x21x3xi1>) -> tensor<13x21x3xi1> +// CHECK: %[[VAL_14:.*]] = tosa.greater %[[VAL_3]], %[[VAL_9]] : (tensor<1x1x1xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi1> +// CHECK: %[[VAL_15:.*]] = tosa.sub %[[VAL_7]], %[[VAL_4]] : (tensor<13x21x3xi32>, tensor<1x1x1xi32>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_16:.*]] = tosa.logical_and %[[VAL_13]], %[[VAL_14]] : (tensor<13x21x3xi1>, tensor<13x21x3xi1>) -> tensor<13x21x3xi1> +// CHECK: %[[VAL_17:.*]] = tosa.select %[[VAL_16]], %[[VAL_15]], %[[VAL_7]] : (tensor<13x21x3xi1>, tensor<13x21x3xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32> +// CHECK: return %[[VAL_17]] : tensor<13x21x3xi32> +// CHECK: } +func.func @test_floor_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor) -> tensor<13x21x3xi32> { + %0 = "tfl.floor_div"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x3xi32>, tensor) -> tensor<13x21x3xi32> + func.return %0 : tensor<13x21x3xi32> } // ----- @@ -496,8 +561,8 @@ func.func @test_relu6_dynamic(%arg0: tensor) -> tensor { // ----- // CHECK-LABEL: test_leaky_relu -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.707330704> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<0.707330704> : tensor<1x1x1xf32>}> // CHECK: %[[VAR1:.*]] = tosa.mul %arg0, %[[VAR0]], %[[SHIFT]] // CHECK: %[[VAR2:.*]] = tosa.maximum %[[VAR1]], %arg0 : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> // CHECK: return %[[VAR2]] : tensor<13x21x3xf32> @@ -509,9 +574,9 @@ func.func @test_leaky_relu(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_prelu -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[1, 2, 3]> : tensor<3xindex>} -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, 2, 3]> : tensor<3xindex>} +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[VAR1:.*]] = tosa.reshape %arg1, %[[VAR10]] // CHECK-DAG: %[[VAR2:.*]] = tosa.mul %arg0, %[[VAR1]], %[[SHIFT]] // CHECK-DAG: %[[VAR3:.*]] = tosa.greater_equal %arg0, %[[VAR0]] @@ -525,22 +590,32 @@ func.func @test_prelu(%arg0: tensor<4x2x3xf32>, %arg1: tensor<2x3xf32>) -> tenso // CHECK-LABEL: 
test_prelu_qu8 // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x4x17x!quant.uniform> -// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<[1, 8, 4, 17]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0> : tensor<1x1x1x1xi32>}> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<8x4x17xi8>}> -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> -// CHECK: %[[VAL_3:.*]] = tosa.rescale %[[VAL_0]] {double_round = false, input_zp = 128 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_4:.*]] = tosa.rescale %[[VAL_3]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_5:.*]] = tosa.rescale %[[VAL_4]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_6:.*]] = tosa.greater_equal %[[VAL_5]], %[[VAL_1]] : (tensor<1x8x4x17xi32>, tensor<1x1x1x1xi32> -// CHECK: %[[VAL_7:.*]] = tosa.rescale %[[VAL_2]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_7]], %[[CONST0]] -// CHECK: %[[VAL_9:.*]] = tosa.mul %[[VAL_5]], %[[VAL_8]], %[[SHIFT]] : (tensor<1x8x4x17xi32>, tensor<1x8x4x17xi32>, tensor<1xi8>) -// CHECK: %[[VAL_10:.*]] = tosa.rescale %[[VAL_9]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 5 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_11:.*]] = tosa.rescale %[[VAL_4]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 5 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_12:.*]] = tosa.select %[[VAL_6]], %[[VAL_11]], %[[VAL_10]] -// CHECK: %[[VAL_13:.*]] = tosa.rescale %[[VAL_12]] {double_round = true, input_zp = 5 : i32, multiplier = array, output_zp = 5 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_14:.*]] = tosa.rescale %[[VAL_13]] {double_round = false, input_zp = 5 : i32, multiplier = array, output_zp = 133 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<32> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<1472433039> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<37> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<1130006236> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[1, 8, 4, 17]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<0> : tensor<1x1x1x1xi32>}> : () -> tensor<1x1x1x1xi32> +// CHECK: %[[VAL_7:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_8:.*]] = "tosa.const"() <{values = dense<"0x1{{.*}}"> : tensor<8x4x17xi8>}> : () -> tensor<8x4x17x!quant.uniform:f32, 0.023982547223567963>> +// CHECK: %[[VAL_9:.*]] = "tosa.const"() <{values = dense<5> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_10:.*]] = "tosa.const"() <{values = dense<-123> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_11:.*]] = "tosa.const"() <{values = dense<1073741824> : 
tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_12:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_13:.*]] = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_14:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_15:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_11]], %[[VAL_12]], %[[VAL_13]], %[[VAL_14]] {input_unsigned = true, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<1x8x4x17x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x8x4x17x!quant.uniform> +// CHECK: %[[VAL_16:.*]] = tosa.rescale %[[VAL_15]], %[[VAL_11]], %[[VAL_12]], %[[VAL_14]], %[[VAL_14]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x8x4x17x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x8x4x17x!quant.uniform> +// CHECK: %[[VAL_17:.*]] = tosa.rescale %[[VAL_16]], %[[VAL_11]], %[[VAL_12]], %[[VAL_14]], %[[VAL_7]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x8x4x17x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<1x8x4x17xi32> +// CHECK: %[[VAL_18:.*]] = tosa.greater_equal %[[VAL_17]], %[[VAL_6]] : (tensor<1x8x4x17xi32>, tensor<1x1x1x1xi32>) -> tensor<1x8x4x17xi1> +// CHECK: %[[VAL_19:.*]] = tosa.rescale %[[VAL_8]], %[[VAL_11]], %[[VAL_12]], %[[VAL_14]], %[[VAL_7]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<8x4x17x!quant.uniform:f32, 0.023982547223567963>>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<8x4x17xi32> +// CHECK: %[[VAL_20:.*]] = tosa.reshape %[[VAL_19]], %[[VAL_5]] : (tensor<8x4x17xi32>, !tosa.shape<4>) -> tensor<1x8x4x17xi32> +// CHECK: %[[VAL_21:.*]] = tosa.mul %[[VAL_17]], %[[VAL_20]], %[[VAL_14]] : (tensor<1x8x4x17xi32>, tensor<1x8x4x17xi32>, tensor<1xi8>) -> tensor<1x8x4x17xi32> +// CHECK: %[[VAL_22:.*]] = tosa.rescale %[[VAL_21]], %[[VAL_4]], %[[VAL_3]], %[[VAL_7]], %[[VAL_9]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x8x4x17xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) -> tensor<1x8x4x17x!quant.uniform> +// CHECK: %[[VAL_23:.*]] = tosa.rescale %[[VAL_16]], %[[VAL_2]], %[[VAL_1]], %[[VAL_14]], %[[VAL_9]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x8x4x17x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x8x4x17x!quant.uniform> +// CHECK: %[[VAL_24:.*]] = tosa.select %[[VAL_18]], %[[VAL_23]], %[[VAL_22]] : (tensor<1x8x4x17xi1>, tensor<1x8x4x17x!quant.uniform>, tensor<1x8x4x17x!quant.uniform>) -> tensor<1x8x4x17x!quant.uniform> +// CHECK: %[[VAL_25:.*]] = tosa.rescale %[[VAL_24]], %[[VAL_11]], %[[VAL_12]], %[[VAL_9]], %[[VAL_9]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x8x4x17x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x8x4x17x!quant.uniform> +// CHECK: %[[VAL_26:.*]] = tosa.rescale %[[VAL_25]], %[[VAL_11]], %[[VAL_12]], %[[VAL_9]], %[[VAL_10]] {input_unsigned = false, 
output_unsigned = true, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<1x8x4x17x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x8x4x17x!quant.uniform> func.func @test_prelu_qu8(%arg0: tensor<1x8x4x17x!quant.uniform>) -> tensor<1x8x4x17x!quant.uniform> { %0 = "tfl.quantize"(%arg0) {qtype = tensor<1x8x4x17x!quant.uniform>} : (tensor<1x8x4x17x!quant.uniform>) -> tensor<1x8x4x17x!quant.uniform> %1 = "tfl.pseudo_qconst"() {qtype = tensor<8x4x17x!quant.uniform:f32, 0.023982547223567963>>, value = dense<"0x191D0557FF212FA1137FDE2B247CE8BA2A8B2213F6B109FA12232EC613FEEE03EF2D265BE5E4F6CB0E09F7F0A95606DA1709EDE632D0F92A2002E98E61F9213997D3FCEBFA0D2DFC4DD00D0700C60C0705F3CFCB01D30C3617C7144C294DAE27061A62E70665021AF50827F40EC9E0172D42B9FB01FB076A09553006F7F710211A031EC9F11BCF130FCC1906D5FED8E5F64E06EAEAFEFD2515F20BB6E3401023C89DFCF8DEC0390B37D8CA2001E1F7BC270ADDE92DFC6D230CE1FEEE1DE8F90ABF9E3ECAEEBC311DF6FDE41F0E31ED0AC309B3121533E7EC2D1B0F1E04D44513E627F4ED5E491D10E53EEA45FF23E31D11D1DE2E0A3B1015AF06102329DEED5C1C180402000B0D071BF0D4FBC0DE0C3BF012E018D80716351D1922F8D508CF2708BA0CEAFE14E4972732FDFD283ED9342A1506F4F137200A12F436D6C9EC071FBCBDEBF4F8051426B8201EC410F9C3C7EFF7CD04D7AC34E2F9D73A5A05CFFA0FF7FD21D6BBEA03F16AF8330C1105285605C9FFE72BE04726DA06F2DCDCDC14C1310CF4E32F06BE0941420B10C9293DD10EFE28D4D20716E6E6EE0A101FFE3AAF1716120EF62FECEBC0F0D72A0903F9E74425EDF82E290E0413BB69F3F45AF30A22D4D024411B4D243BE13FB9CBE0F5FA16A1D7532007AEF62837C42406E3ED3CCE0408CA1C0CFA18B40C0BF7261E06D3E504B8E714BCF6F010DB12373739E200E609E9DAEF1922A2C338FEF2C519F0E5101E2AE917DCA3FA27D245DD10F0EBCE"> : tensor<8x4x17xi8>} : () -> tensor<8x4x17x!quant.uniform:f32, 0.023982547223567963>> @@ -551,20 +626,28 @@ func.func @test_prelu_qu8(%arg0: tensor<1x8x4x17x!quant.uniform> -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[1, 8, 4, 17]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<0> : tensor<1x1x1x1xi32>}> : () -> tensor<1x1x1x1xi32> -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<8x4x17xi8>}> : () -> tensor<8x4x17x!quant.uniform:f32, 0.021805247291922569>> -// CHECK: %[[VAL_5:.*]] = tosa.rescale %[[VAL_0]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} : (tensor<1x8x4x17x!quant.uniform>) -> tensor<1x8x4x17xi32> -// CHECK: %[[VAL_6:.*]] = tosa.greater_equal %[[VAL_5]], %[[VAL_3]] : (tensor<1x8x4x17xi32>, tensor<1x1x1x1xi32>) -> tensor<1x8x4x17xi1> -// CHECK: %[[VAL_7:.*]] = tosa.rescale %[[VAL_4]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} : (tensor<8x4x17x!quant.uniform:f32, 0.021805247291922569>>) -> tensor<8x4x17xi32> -// CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_7]], %[[VAL_2]] : (tensor<8x4x17xi32>, !tosa.shape<4>) -> tensor<1x8x4x17xi32> -// CHECK: %[[VAL_9:.*]] = tosa.mul %[[VAL_5]], %[[VAL_8]], %[[VAL_1]] : (tensor<1x8x4x17xi32>, tensor<1x8x4x17xi32>, tensor<1xi8>) -> tensor<1x8x4x17xi32> -// CHECK: %[[VAL_10:.*]] = tosa.rescale %[[VAL_9]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 1 : i32, per_channel = false, scale32 = true, shift = array} : (tensor<1x8x4x17xi32>) -> tensor<1x8x4x17x!quant.uniform> -// CHECK: %[[VAL_11:.*]] = 
tosa.rescale %[[VAL_0]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 1 : i32, per_channel = false, scale32 = true, shift = array} : (tensor<1x8x4x17x!quant.uniform>) -> tensor<1x8x4x17x!quant.uniform> -// CHECK: %[[VAL_12:.*]] = tosa.select %[[VAL_6]], %[[VAL_11]], %[[VAL_10]] : (tensor<1x8x4x17xi1>, tensor<1x8x4x17x!quant.uniform>, tensor<1x8x4x17x!quant.uniform>) -> tensor<1x8x4x17x!quant.uniform> +// CHECK-LABEL: func.func @test_prelu_qi8( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x4x17x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<32> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<1582183328> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<1> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<37> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<1103996759> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_6:.*]] = tosa.const_shape {values = dense<[1, 8, 4, 17]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_7:.*]] = "tosa.const"() <{values = dense<0> : tensor<1x1x1x1xi32>}> : () -> tensor<1x1x1x1xi32> +// CHECK: %[[VAL_8:.*]] = "tosa.const"() <{values = dense<"0xD{{.*}}"> : tensor<8x4x17xi8>}> : () -> tensor<8x4x17x!quant.uniform:f32, 0.021805247291922569>> +// CHECK: %[[VAL_9:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_10:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_11:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_12:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_13:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_9]], %[[VAL_10]], %[[VAL_11]], %[[VAL_12]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x8x4x17x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<1x8x4x17xi32> +// CHECK: %[[VAL_14:.*]] = tosa.greater_equal %[[VAL_13]], %[[VAL_7]] : (tensor<1x8x4x17xi32>, tensor<1x1x1x1xi32>) -> tensor<1x8x4x17xi1> +// CHECK: %[[VAL_15:.*]] = tosa.rescale %[[VAL_8]], %[[VAL_9]], %[[VAL_10]], %[[VAL_11]], %[[VAL_12]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<8x4x17x!quant.uniform:f32, 0.021805247291922569>>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<8x4x17xi32> +// CHECK: %[[VAL_16:.*]] = tosa.reshape %[[VAL_15]], %[[VAL_6]] : (tensor<8x4x17xi32>, !tosa.shape<4>) -> tensor<1x8x4x17xi32> +// CHECK: %[[VAL_17:.*]] = tosa.mul %[[VAL_13]], %[[VAL_16]], %[[VAL_11]] : (tensor<1x8x4x17xi32>, tensor<1x8x4x17xi32>, tensor<1xi8>) -> tensor<1x8x4x17xi32> +// CHECK: %[[VAL_18:.*]] = tosa.rescale %[[VAL_17]], %[[VAL_5]], %[[VAL_4]], %[[VAL_12]], %[[VAL_3]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x8x4x17xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) -> tensor<1x8x4x17x!quant.uniform> +// CHECK: %[[VAL_19:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_2]], %[[VAL_1]], %[[VAL_11]], %[[VAL_3]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = 
true} : (tensor<1x8x4x17x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x8x4x17x!quant.uniform> +// CHECK: %[[VAL_20:.*]] = tosa.select %[[VAL_14]], %[[VAL_19]], %[[VAL_18]] : (tensor<1x8x4x17xi1>, tensor<1x8x4x17x!quant.uniform>, tensor<1x8x4x17x!quant.uniform>) -> tensor<1x8x4x17x!quant.uniform> func.func @test_prelu_qi8(%arg0: tensor<1x8x4x17x!quant.uniform>) -> tensor<1x8x4x17x!quant.uniform> { %0 = "tfl.pseudo_qconst"() {qtype = tensor<8x4x17x!quant.uniform:f32, 0.021805247291922569>>, value = dense<"0xDAFDEBC120CBE1E028231F05CF04F52484B2F0AC0041E618200308F820FE308FFCF2E1E02A06D00606FB1044C928D8D811E3FCCE350E25C4DE2B0D00E20AC1E215940D0D12C809290D480FE9E2DB26E31E50F5F4FDD31EFF21C210E717E187144F27C848E820C5D503E31729218D96D2D6D3D9C43BF13014EFCB043631AE4403FE2D4CDF1F16E2D13BA20AE92CEAB7323405F728CF3DF4E9BBFAFEFEE120ECA7FA120609030FF0FCF0E5D40939172EE7E256BADEC5ECFFB32C35F4E936E2F8092FE2E3EFE22B0C02F5EE1D36DE03CBE02FF346081C30ED882AECCAF4E4E3361604EABF133CB6371DDAFCDA4F2D32034A270BF0120A0048131331E50D11CAEB1DEE0ADFC0F12531E8351DD7BDEB2821FF3ECC34F8D42EE4D6FF2AE5FEEDFC3DF7463CED10192CE4B728151827A92E000EE31CF3C5DF193DAC2836181BD916D339E914192B14F0163C58C500BDC6BAEFFB03EC33DA24E7FF0E292CE30504B3070AB5FDE6D7E7CB4CB0D818F90919EAEF5DFDF2DB6C4132DF8EF2E40AF7EA04F1D496F22F2971420FF01D012E2954D5081C0AF2C5E5DED2CCD8C6157416201AFF3A2B29FBDD9EF06340B021F45C322A202DDD86111EBDF44BE9110E29F3FE7FDEDDFB5FDEDBD933E2ED0DD4E21C4BC6FD28E31934C821CE10F61C12740A100F1BE205CC01434BD7E3FB14F01CE0E406710022E464E0F0D8FB3D01C733C9C94017FAC50BE812D202E2B10C04E70AF326CEFD0DE20ABD153D3D14171C34061DE5FC5A"> : tensor<8x4x17xi8>} : () -> tensor<8x4x17x!quant.uniform:f32, 0.021805247291922569>> %1 = "tfl.prelu"(%arg0, %0) : (tensor<1x8x4x17x!quant.uniform>, tensor<8x4x17x!quant.uniform:f32, 0.021805247291922569>>) -> tensor<1x8x4x17x!quant.uniform> @@ -591,11 +674,38 @@ func.func @test_logical_or(%arg0: tensor<13x1x3xi1>, %arg1: tensor<13x21x3xi1>) // ----- +// CHECK-LABEL: test_bitwise_xor_int8 +// CHECK: %[[VAR0:.*]] = tosa.bitwise_xor %arg0, %arg1 : (tensor<1x11x5xi8>, tensor<29x11x5xi8>) -> tensor<29x11x5xi8> +func.func @test_bitwise_xor_int8(%arg0: tensor<1x11x5xi8>, %arg1: tensor<29x11x5xi8>) -> tensor<29x11x5xi8> { + %0 = "tfl.bitwise_xor"(%arg0, %arg1) : (tensor<1x11x5xi8>, tensor<29x11x5xi8>) -> tensor<29x11x5xi8> + func.return %0 : tensor<29x11x5xi8> +} + +// ----- + +// CHECK-LABEL: test_bitwise_xor_int16 +// CHECK: %[[VAR0:.*]] = tosa.bitwise_xor %arg0, %arg1 : (tensor<1x11x5xi16>, tensor<29x11x5xi16>) -> tensor<29x11x5xi16> +func.func @test_bitwise_xor_int16(%arg0: tensor<1x11x5xi16>, %arg1: tensor<29x11x5xi16>) -> tensor<*xi16> { + %0 = "tfl.bitwise_xor"(%arg0, %arg1) : (tensor<1x11x5xi16>, tensor<29x11x5xi16>) -> tensor<*xi16> + func.return %0 : tensor<*xi16> +} + +// ----- + +// CHECK-LABEL: test_bitwise_xor_int32 +// CHECK: %[[VAR0:.*]] = tosa.bitwise_xor %arg0, %arg1 : (tensor<4x16x1xi32>, tensor<1x16x1xi32>) -> tensor<4x16x1xi32> +func.func @test_bitwise_xor_int32(%arg0: tensor<4x16x1xi32>, %arg1: tensor<1x16x1xi32>) -> tensor<4x16x1xi32> { + %0 = "tfl.bitwise_xor"(%arg0, %arg1) : (tensor<4x16x1xi32>, tensor<1x16x1xi32>) -> tensor<4x16x1xi32> + func.return %0 : tensor<4x16x1xi32> +} + +// ----- + // CHECK-LABEL: test_logical_not // CHECK: %[[VAR0:.*]] = tosa.logical_not %arg0 -func.func @test_logical_not(%arg0: tensor<1x21x3xi1>) -> tensor<*xi1> { - %0 = "tfl.logical_not"(%arg0) : (tensor<1x21x3xi1>) -> tensor<*xi1> - func.return %0 : tensor<*xi1> +func.func 
@test_logical_not(%arg0: tensor<1x21x3xi1>) -> tensor<1x21x3xi1> { + %0 = "tfl.logical_not"(%arg0) : (tensor<1x21x3xi1>) -> tensor<1x21x3xi1> + func.return %0 : tensor<1x21x3xi1> } // ----- @@ -622,7 +732,7 @@ func.func @test_reduce_all_axis_1_keep_true(%arg0: tensor<1x4x8x19xi1>) -> tenso // CHECK-LABEL: test_reduce_all_axis_1_keep_false // CHECK-SAME: %[[VAL_0:.+]]: tensor<1x4x8x19xi1> -// CHECK-DAG: %[[VAL_10:.+]] = tosa.const_shape {value = dense<[1, 8, 19]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_10:.+]] = tosa.const_shape {values = dense<[1, 8, 19]> : tensor<3xindex>} // CHECK: %[[VAL_1:.*]] = tosa.reduce_all %[[VAL_0]] {axis = 1 : i32} : (tensor<1x4x8x19xi1>) -> tensor<1x1x8x19xi1> // CHECK: %[[VAL_2:.*]] = tosa.reshape %[[VAL_1]], %[[VAL_10]] : (tensor<1x1x8x19xi1>, !tosa.shape<3>) -> tensor<1x8x19xi1> func.func @test_reduce_all_axis_1_keep_false(%arg0: tensor<1x4x8x19xi1>) -> tensor<1x8x19xi1> { @@ -646,7 +756,7 @@ func.func @test_reduce_all_axis_2_keep_true(%arg0: tensor<1x4x8x19xi1>) -> tenso // CHECK-LABEL: test_reduce_all_axis_2_keep_false // CHECK-SAME: %[[VAL_0:.+]]: tensor<1x4x8x19xi1> -// CHECK: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[1, 4, 19]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK: %[[VAL_10:.*]] = tosa.const_shape {values = dense<[1, 4, 19]> : tensor<3xindex>} : () -> !tosa.shape<3> // CHECK: %[[VAL_1:.*]] = tosa.reduce_all %[[VAL_0]] {axis = 2 : i32} : (tensor<1x4x8x19xi1>) -> tensor<1x4x1x19xi1> // CHECK: %[[VAL_2:.*]] = tosa.reshape %[[VAL_1]], %[[VAL_10]] : (tensor<1x4x1x19xi1>, !tosa.shape<3>) -> tensor<1x4x19xi1> func.func @test_reduce_all_axis_2_keep_false(%arg0: tensor<1x4x8x19xi1>) -> tensor<1x4x19xi1> { @@ -659,7 +769,7 @@ func.func @test_reduce_all_axis_2_keep_false(%arg0: tensor<1x4x8x19xi1>) -> tens // CHECK-LABEL: test_reduce_any // CHECK-DAG: %[[VAR0:.*]] = tosa.reduce_any %arg0 {axis = 0 : i32} -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} // CHECK: %[[VAR1:.*]] = tosa.reshape %[[VAR0]], %[[VAR10]] func.func @test_reduce_any(%arg0: tensor<13x21x3xi1>) -> tensor<21x3xi1> { %cst = arith.constant dense<0> : tensor<1xi32> @@ -669,9 +779,21 @@ func.func @test_reduce_any(%arg0: tensor<13x21x3xi1>) -> tensor<21x3xi1> { // ----- +// CHECK-LABEL: test_reduce_any_dynamic_output +// CHECK-DAG: %[[VAR0:.*]] = tosa.reduce_any %arg0 {axis = 0 : i32} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} +// CHECK: %[[VAR1:.*]] = tosa.reshape %[[VAR0]], %[[VAR10]] +func.func @test_reduce_any_dynamic_output(%arg0: tensor<13x21x3xi1>) -> tensor { + %cst = arith.constant dense<0> : tensor<1xi32> + %0 = "tfl.reduce_any"(%arg0, %cst) {keep_dims = false} : (tensor<13x21x3xi1>, tensor<1xi32>) -> tensor + func.return %0 : tensor +} + +// ----- + // CHECK-LABEL: test_reduce_min // CHECK-DAG: %[[VAR0:.*]] = tosa.reduce_min %arg0 {axis = 0 : i32} -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} // CHECK: %[[VAR1:.*]] = tosa.reshape %[[VAR0]], %[[VAR10]] func.func @test_reduce_min(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %cst = arith.constant dense<0> : tensor<1xi32> @@ -683,7 +805,7 @@ func.func @test_reduce_min(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // CHECK-LABEL: test_reduce_max // CHECK-DAG: %[[VAR0:.*]] = tosa.reduce_max %arg0 
{axis = 0 : i32} -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} // CHECK: %[[VAR1:.*]] = tosa.reshape %[[VAR0]], %[[VAR10]] func.func @test_reduce_max(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %cst = arith.constant dense<0> : tensor<1xi32> @@ -695,7 +817,7 @@ func.func @test_reduce_max(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // CHECK-LABEL: test_reduce_sum // CHECK-DAG: %[[VAR0:.*]] = tosa.reduce_sum %arg0 {axis = 0 : i32} -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} // CHECK: %[[VAR1:.*]] = tosa.reshape %[[VAR0]], %[[VAR10]] func.func @test_reduce_sum(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %cst = arith.constant dense<0> : tensor<1xi32> @@ -707,7 +829,7 @@ func.func @test_reduce_sum(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // CHECK-LABEL: test_reduce_sum_nonzero_axis // CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20x30x40x50xf32> -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[10, 20, 30, 50]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[10, 20, 30, 50]> : tensor<4xindex>} // CHECK-DAG: %[[VAL_2:.*]] = tosa.reduce_sum %[[VAL_0]] {axis = 3 : i32} : (tensor<10x20x30x40x50xf32>) -> tensor<10x20x30x1x50xf32> // CHECK-DAG: %[[VAL_3:.*]] = tosa.reshape %[[VAL_2]], %[[VAL_1]] : (tensor<10x20x30x1x50xf32>, !tosa.shape<4>) -> tensor<10x20x30x50xf32> // CHECK: return %[[VAL_3]] : tensor<10x20x30x50xf32> @@ -720,7 +842,7 @@ func.func @test_reduce_sum_nonzero_axis(%arg0: tensor<10x20x30x40x50xf32> {tf._u // ----- // CHECK-LABEL: test_reduce_sum_5D -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[6, 8]> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[6, 8]> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAR1:.*]] = tosa.reduce_sum %arg0 {axis = 0 : i32} : (tensor<4x5x6x7x8xf32>) -> tensor<1x5x6x7x8xf32> // CHECK-DAG: %[[VAR2:.*]] = tosa.reduce_sum %[[VAR1]] {axis = 1 : i32} : (tensor<1x5x6x7x8xf32>) -> tensor<1x1x6x7x8xf32> // CHECK-DAG: %[[VAR3:.*]] = tosa.reduce_sum %[[VAR2]] {axis = 3 : i32} : (tensor<1x1x6x7x8xf32>) -> tensor<1x1x6x1x8xf32> @@ -735,11 +857,11 @@ func.func @test_reduce_sum_5D(%arg0: tensor<4x5x6x7x8xf32>) -> tensor<6x8xf32> { // ----- // CHECK-LABEL: test_reduce_mean -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<0.0769230798> : tensor<1x1xf32>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<0.0769230798> : tensor<1x1xf32>}> // CHECK-DAG: %[[VAR1:.*]] = tosa.reduce_sum %arg0 {axis = 0 : i32} -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} // CHECK-DAG: %[[VAR2:.*]] = tosa.reshape %[[VAR1]], %[[VAR10]] -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK: %[[VAR4:.*]] = tosa.mul %[[VAR2]], %[[VAR0]], %[[SHIFT]] func.func @test_reduce_mean(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %cst = arith.constant dense<0> : tensor<1xi32> @@ -749,6 +871,21 @@ func.func @test_reduce_mean(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { // ----- +// CHECK-LABEL: 
test_reduce_mean_dynamic_output +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<0.0769230798> : tensor<1x1xf32>}> +// CHECK-DAG: %[[VAR1:.*]] = tosa.reduce_sum %arg0 {axis = 0 : i32} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR2:.*]] = tosa.reshape %[[VAR1]], %[[VAR10]] +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> +// CHECK: %[[VAR4:.*]] = tosa.mul %[[VAR2]], %[[VAR0]], %[[SHIFT]] +func.func @test_reduce_mean_dynamic_output(%arg0: tensor<13x21x3xf32>) -> tensor { + %cst = arith.constant dense<0> : tensor<1xi32> + %0 = "tfl.mean"(%arg0, %cst) {keep_dims = false} : (tensor<13x21x3xf32>, tensor<1xi32>) -> tensor + func.return %0 : tensor +} + +// ----- + // CHECK-LABEL: test_reduce_mean_out_of_bounds // CHECK: "tfl.mean" func.func @test_reduce_mean_out_of_bounds(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { @@ -759,9 +896,27 @@ func.func @test_reduce_mean_out_of_bounds(%arg0: tensor<13x21x3xf32>) -> tensor< // ----- +// CHECK-LABEL: test_reduce_mean_qi8 +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x2x2x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<31> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<1105078632> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_7:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_3]], %[[VAL_4]], %[[VAL_5]], %[[VAL_6]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x2x2x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<1x2x2xi32> +// CHECK: %[[VAL_8:.*]] = tosa.reduce_sum %[[VAL_7]] {axis = 2 : i32} : (tensor<1x2x2xi32>) -> tensor<1x2x1xi32> +// CHECK: %[[VAL_9:.*]] = tosa.rescale %[[VAL_8]], %[[VAL_2]], %[[VAL_1]], %[[VAL_6]], %[[VAL_5]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x2x1xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) -> tensor<1x2x1x!quant.uniform> +func.func @test_reduce_mean_qi8(%arg0: tensor<1x2x2x!quant.uniform>) -> (tensor<1x2x1x!quant.uniform>) { +%0 = "tfl.pseudo_const"() {value = dense<-1> : tensor} : () -> tensor +%1 = "tfl.mean"(%arg0, %0) {keep_dims = true} : (tensor<1x2x2x!quant.uniform>, tensor) -> tensor<1x2x1x!quant.uniform> +return %1 : tensor<1x2x1x!quant.uniform> +} + +// ----- + // CHECK-LABEL: test_reduce_product // CHECK-DAG: %[[VAR0:.*]] = tosa.reduce_product %arg0 {axis = 0 : i32} -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[21, 3]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[21, 3]> : tensor<2xindex>} // CHECK: %[[VAR1:.*]] = tosa.reshape %[[VAR0]], %[[VAR10]] func.func @test_reduce_product(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xf32> { %cst = arith.constant dense<0> : tensor<1xi32> @@ -847,15 +1002,38 @@ func.func @test_floor(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // CHECK-LABEL: test_log // CHECK: %[[VAR0:.*]] = tosa.log %arg0 -func.func @test_log(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { - %0 = "tfl.log"(%arg0) : (tensor<13x21x3xf32>) -> 
tensor<*xf32> - func.return %0 : tensor<*xf32> +func.func @test_log(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = "tfl.log"(%arg0) : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + func.return %0 : tensor<13x21x3xf32> +} + +// ----- + +// CHECK-LABEL: test_log_qi8 +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<{{.+}}> : tensor<256xi8>}> +// CHECK: %[[VAL_2:.*]] = tosa.table %[[VAL_0]], %[[VAL_1]] +func.func @test_log_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> (tensor<13x21x3x!quant.uniform>) { + %0 = "tfl.log"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> + func.return %0 : tensor<13x21x3x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: test_log_qi16 +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<{{.+}}> : tensor<513xi16>}> +// CHECK: %[[VAL_2:.*]] = tosa.table %[[VAL_0]], %[[VAL_1]] +func.func @test_log_qi16(%arg0: tensor<13x21x3x!quant.uniform>) -> (tensor<13x21x3x!quant.uniform>) { + %0 = "tfl.log"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> + func.return %0 : tensor<13x21x3x!quant.uniform> } // ----- // CHECK-LABEL: test_negate -// CHECK: %[[VAR0:.*]] = tosa.negate %arg0 +// CHECK-DAG: %[[CONST_0:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK-DAG: %[[VAR0:.*]] = tosa.negate %arg0, %[[CONST_0]], %[[CONST_0]] func.func @test_negate(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %0 = "tfl.neg"(%arg0) : (tensor<13x21x3xf32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -875,7 +1053,7 @@ func.func @test_rsqrt(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // CHECK-LABEL: test_rsqrt_qi8 // CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3x!quant.uniform> -// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<{{.+}}> : tensor<256xi8>}> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<{{.+}}> : tensor<256xi8>}> // CHECK: %[[VAL_2:.*]] = tosa.table %[[VAL_0]], %[[VAL_1]] func.func @test_rsqrt_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> (tensor<13x21x3x!quant.uniform>) { %0 = "tfl.rsqrt"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> @@ -886,9 +1064,9 @@ func.func @test_rsqrt_qi8(%arg0: tensor<13x21x3x!quant.uniform -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0> : tensor<1x1xi32>}> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<-1> : tensor<1x1xi32>}> -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<1> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<0> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<-1> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<1> : tensor<1x1xi32>}> // CHECK: %[[VAL_4:.*]] = tosa.greater %[[VAL_0]], %[[VAL_1]] // CHECK: %[[VAL_5:.*]] = tosa.greater %[[VAL_1]], %[[VAL_0]] // CHECK: %[[VAL_6:.*]] = tosa.select %[[VAL_5]], %[[VAL_2]], %[[VAL_1]] @@ -922,15 +1100,16 @@ func.func @test_cos(%arg0: tensor<10xf32>) -> tensor<*xf32> { // CHECK-LABEL: test_atan2 // CHECK-SAME: -> tensor<13x21x3xf32> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<2.000000e+00> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = 
dense<3.276700e+04> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{value = dense<2.38418579E-7> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{value = dense<1.57079637> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_7:.*]] = "tosa.const"() <{value = dense<3.14159274> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_8:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> -// CHECK-DAG: %[[VAL_9:.*]] = "tosa.const"() <{value = dense<{{.+}}> : tensor<513xi16>}> : () -> tensor<513xi16> -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[CONST_0:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<2.000000e+00> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<1.000000e+00> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<3.276700e+04> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<2.38418579E-7> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<1.57079637> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_7:.*]] = "tosa.const"() <{values = dense<3.14159274> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_8:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32> +// CHECK-DAG: %[[VAL_9:.*]] = "tosa.const"() <{values = dense<{{.+}}> : tensor<513xi16>}> : () -> tensor<513xi16> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK: %[[VAL_10:.*]] = tosa.abs %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> // CHECK: %[[VAL_11:.*]] = tosa.abs %arg1 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> // CHECK: %[[VAL_12:.*]] = tosa.minimum %[[VAL_10]], %[[VAL_11]] : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> @@ -950,13 +1129,13 @@ func.func @test_cos(%arg0: tensor<10xf32>) -> tensor<*xf32> { // CHECK: %[[VAL_26:.*]] = tosa.sub %[[VAL_7]], %[[VAL_25]] : (tensor<1x1x1xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> // CHECK: %[[VAL_27:.*]] = tosa.greater %[[VAL_8]], %arg1 : (tensor<1x1x1xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xi1> // CHECK: %[[VAL_28:.*]] = tosa.select %[[VAL_27]], %[[VAL_26]], %[[VAL_25]] : (tensor<13x21x3xi1>, tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> -// CHECK: %[[VAL_29:.*]] = tosa.negate %[[VAL_28]] : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> +// CHECK: %[[VAL_29:.*]] = tosa.negate %[[VAL_28]], %[[CONST_0]], %[[CONST_0]] : (tensor<13x21x3xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<13x21x3xf32> // CHECK: %[[VAL_30:.*]] = tosa.greater %[[VAL_8]], %arg0 : (tensor<1x1x1xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xi1> // CHECK: %[[VAL_31:.*]] = tosa.select %[[VAL_30]], %[[VAL_29]], %[[VAL_28]] : (tensor<13x21x3xi1>, tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> // CHECK: return %[[VAL_31]] : tensor<13x21x3xf32> -func.func @test_atan2(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<*xf32> { - %0 = "tfl.atan2"(%arg0, %arg1) : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<*xf32> - 
func.return %0 : tensor<*xf32> +func.func @test_atan2(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = "tfl.atan2"(%arg0, %arg1) : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + func.return %0 : tensor<13x21x3xf32> } // ----- @@ -972,7 +1151,7 @@ func.func @test_sigmoid(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_square -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK: %[[VAR0:.*]] = tosa.mul %arg0, %arg0, %[[SHIFT]] func.func @test_square(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %0 = "tfl.square"(%arg0) : (tensor<13x21x3xf32>) -> tensor<*xf32> @@ -1047,7 +1226,8 @@ func.func @test_less_equal_dynamic(%arg0: tensor<13x1x3xf32>, %arg1: tensor<13x? // ----- // CHECK-LABEL: test_avg_pool2d -// CHECK: %[[VAR0:.*]] = tosa.avg_pool2d %arg0 {acc_type = f32, kernel = array, pad = array, stride = array} +// CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK: %[[VAR0:.*]] = tosa.avg_pool2d %arg0, %[[ZP]], %[[ZP]] {acc_type = f32, kernel = array, pad = array, stride = array} func.func @test_avg_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8xf32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -1056,7 +1236,8 @@ func.func @test_avg_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_avg_pool2d_dynamic -// CHECK: %[[VAR0:.*]] = tosa.avg_pool2d %arg0 {acc_type = f32, kernel = array, pad = array, stride = array} +// CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK: %[[VAR0:.*]] = tosa.avg_pool2d %arg0, %[[ZP]], %[[ZP]] {acc_type = f32, kernel = array, pad = array, stride = array} func.func @test_avg_pool2d_dynamic(%arg0: tensor) -> tensor<*xf32> { %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -1064,6 +1245,19 @@ func.func @test_avg_pool2d_dynamic(%arg0: tensor) -> tensor<*xf32 // ----- +// CHECK-LABEL: test_avg_pool2d_slicing +// CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[1, 31, 31, 8]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_3:.*]] = tosa.slice %arg0, %[[VAL_2]], %[[VAL_1]] : (tensor<1x32x32x8xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<1x31x31x8xf32> +// CHECK: %[[VAL_4:.*]] = tosa.avg_pool2d %[[VAL_3]], %[[ZP]], %[[ZP]] {acc_type = f32, kernel = array, pad = array, stride = array} : (tensor<1x31x31x8xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x15x15x8xf32> +func.func @test_avg_pool2d_slicing(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { + %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x32x32x8xf32>) -> 
tensor<*xf32> + func.return %0 : tensor<*xf32> +} + +// ----- + // CHECK-LABEL: test_max_pool2d // CHECK: %[[VAR0:.*]] = tosa.max_pool2d %arg0 {kernel = array, pad = array, stride = array} func.func @test_max_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { @@ -1082,8 +1276,20 @@ func.func @test_max_pool2d_dynamic(%arg0: tensor) -> tensor<*xf32 // ----- +// CHECK-LABEL: test_max_pool2d_slicing +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[1, 31, 31, 8]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_3:.*]] = tosa.slice %[[VAL_0]], %[[VAL_2]], %[[VAL_1]] : (tensor<1x32x32x8xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<1x31x31x8xf32> +// CHECK: %[[VAL_4:.*]] = tosa.max_pool2d %[[VAL_3]] {kernel = array, pad = array, stride = array} : (tensor<1x31x31x8xf32>) -> tensor<1x15x15x8xf32> +func.func @test_max_pool2d_slicing(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { + %0 = "tfl.max_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x32x32x8xf32>) -> tensor<*xf32> + func.return %0 : tensor<*xf32> +} + +// ----- + // CHECK-LABEL: test_reshape -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[1, 819]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, 819]> : tensor<2xindex>} // CHECK: %[[VAR0:.*]] = tosa.reshape %arg0, %[[VAR10]] func.func @test_reshape(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[1, 819]> : tensor<2xi32> @@ -1094,7 +1300,7 @@ func.func @test_reshape(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_reshape_unknown -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[9, 91]> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[9, 91]> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK: %[[VAR0:.*]] = tosa.reshape %arg0, %[[VAR10]] // CHECK-SAME: -> tensor<9x91xf32> func.func @test_reshape_unknown(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { @@ -1106,7 +1312,7 @@ func.func @test_reshape_unknown(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_reshape_dynamic -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[3, -1]> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[3, -1]> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK: %[[VAR0:.*]] = tosa.reshape %arg0, %[[VAR10]] // CHECK-SAME: -> tensor<3x?xf32> func.func @test_reshape_dynamic(%arg0: tensor<13x21x?xf32>) -> tensor<*xf32> { @@ -1118,7 +1324,7 @@ func.func @test_reshape_dynamic(%arg0: tensor<13x21x?xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_reshape_dynamic_ranked_output -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[1, -1, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, -1, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> // CHECK: %[[VAR0:.*]] = tosa.reshape %arg0, %[[VAR10]] func.func @test_reshape_dynamic_ranked_output(%arg0: tensor) -> tensor<1x?x2xf32> { %cst = arith.constant dense<[1, -1, 2]> : tensor<3xi32> @@ -1149,8 +1355,8 @@ func.func @test_transpose_dynamic(%arg0: tensor<13x?x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_slice -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape 
{value = dense<[4, 11, 1]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[6, 8, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[4, 11, 1]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[6, 8, 0]> : tensor<3xindex>} // CHECK: %[[VAL_3:.*]] = tosa.slice %arg0, %[[VAL_2]], %[[VAL_1]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<4x11x1xf32> func.func @test_slice(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[6, 8, 0]> : tensor<3xi32> @@ -1162,8 +1368,8 @@ func.func @test_slice(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_slice_minus1_size -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[4, 13, 1]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[6, 8, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[4, 13, 1]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[6, 8, 0]> : tensor<3xindex>} // CHECK: %[[VAL_3:.*]] = tosa.slice %arg0, %[[VAL_2]], %[[VAL_1]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<4x13x1xf32> func.func @test_slice_minus1_size(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[6, 8, 0]> : tensor<3xi32> @@ -1175,12 +1381,12 @@ func.func @test_slice_minus1_size(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_strided_slice_simple -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[9, 7, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[9, 7, 1, 2]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[9, 7, 3, 2]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[9, 21, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {value = dense<[4, 0, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[9, 7, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[9, 7, 1, 2]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[9, 7, 3, 2]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[9, 21, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {values = dense<[4, 0, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> // CHECK: %[[VAL_7:.*]] = tosa.slice %arg0, %[[VAL_6]], %[[VAL_5]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<9x21x2xf32> // CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_7]], %[[VAL_4]] : (tensor<9x21x2xf32>, !tosa.shape<4>) -> tensor<9x7x3x2xf32> // CHECK: %[[VAL_9:.*]] = tosa.slice %[[VAL_8]], %[[VAL_2]], %[[VAL_3]] : (tensor<9x7x3x2xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<9x7x1x2xf32> @@ -1196,12 +1402,12 @@ func.func @test_strided_slice_simple(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32 // ----- // CHECK-LABEL: test_strided_slice_simple_negative -// CHECK-DAG: 
%[[VAL_1:.*]] = tosa.const_shape {value = dense<[9, 18, 2]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[4, 0, 1]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[9, 6, 3, 2]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[9, 6, 1, 2]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<0> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {value = dense<[9, 6, 2]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[9, 18, 2]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[4, 0, 1]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[9, 6, 3, 2]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[9, 6, 1, 2]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {values = dense<[9, 6, 2]> : tensor<3xindex>} // CHECK: %[[VAL_7:.*]] = tosa.slice %arg0, %[[VAL_2]], %[[VAL_1]] // CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_7]], %[[VAL_3]] // CHECK: %[[VAL_9:.*]] = tosa.slice %[[VAL_8]], %[[VAL_5]], %[[VAL_4]] @@ -1217,9 +1423,9 @@ func.func @test_strided_slice_simple_negative(%arg0: tensor<13x21x3xf32>) -> ten // ----- // CHECK-LABEL: test_strided_slice_strideless -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[9, 2]> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[9, 1, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[4, 0, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[9, 2]> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[9, 1, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[4, 0, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> // CHECK: %[[VAL_4:.*]] = tosa.slice %arg0, %[[VAL_3]], %[[VAL_2]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<9x1x2xf32> // CHECK: %[[VAL_5:.*]] = tosa.reshape %[[VAL_4]], %[[VAL_1]] : (tensor<9x1x2xf32>, !tosa.shape<2>) -> tensor<9x2xf32> func.func @test_strided_slice_strideless(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { @@ -1233,12 +1439,12 @@ func.func @test_strided_slice_strideless(%arg0: tensor<13x21x3xf32>) -> tensor<* // ----- // CHECK-LABEL: test_strided_slice_shrink -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<7> : tensor<1xindex>} : () -> !tosa.shape<1> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[1, 7, 1, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[1, 7, 3, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[1, 21, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {value = dense<[4, 0, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<7> : tensor<1xindex>} : () -> !tosa.shape<1> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} : () -> 
!tosa.shape<4> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[1, 7, 1, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[1, 7, 3, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[1, 21, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {values = dense<[4, 0, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> // CHECK: %[[VAL_7:.*]] = tosa.slice %arg0, %[[VAL_6]], %[[VAL_5]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<1x21x1xf32> // CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_7]], %[[VAL_4]] : (tensor<1x21x1xf32>, !tosa.shape<4>) -> tensor<1x7x3x1xf32> // CHECK: %[[VAL_9:.*]] = tosa.slice %[[VAL_8]], %[[VAL_2]], %[[VAL_3]] : (tensor<1x7x3x1xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<1x7x1x1xf32> @@ -1254,9 +1460,9 @@ func.func @test_strided_slice_shrink(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32 // ----- // CHECK-LABEL: test_strided_slice_shrink_ignore_stride -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[1, 1, 2]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[4, 0, 1]> : tensor<3xindex>} -// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<2> : tensor<1xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[1, 1, 2]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[4, 0, 1]> : tensor<3xindex>} +// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {values = dense<2> : tensor<1xindex>} // CHECK: %[[VAL_3:.*]] = tosa.slice %arg0, %[[VAL_2]], %[[VAL_1]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<1x1x2xf32> // CHECK: %[[VAL_4:.*]] = tosa.reshape %[[VAL_3]], %[[CONST0]] : (tensor<1x1x2xf32>, !tosa.shape<1>) -> tensor<2xf32> func.func @test_strided_slice_shrink_ignore_stride(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { @@ -1270,8 +1476,8 @@ func.func @test_strided_slice_shrink_ignore_stride(%arg0: tensor<13x21x3xf32>) - // ----- // CHECK-LABEL: test_strided_slice_unstrided -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[9, 21, 2]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[4, 0, 1]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[9, 21, 2]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[4, 0, 1]> : tensor<3xindex>} // CHECK: %[[VAL_3:.*]] = tosa.slice %arg0, %[[VAL_2]], %[[VAL_1]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<9x21x2xf32> // CHECK: %[[VAL_4:.*]] = tosa.reverse %[[VAL_3]] {axis = 2 : i32} : (tensor<9x21x2xf32>) -> tensor<9x21x2xf32> func.func @test_strided_slice_unstrided(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { @@ -1285,8 +1491,8 @@ func.func @test_strided_slice_unstrided(%arg0: tensor<13x21x3xf32>) -> tensor<*x // ----- // CHECK-LABEL: test_strided_slice_unstrided_shorter -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[9, 21, 3]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[4, 0, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[9, 21, 3]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[4, 0, 0]> : tensor<3xindex>} // CHECK: %[[VAL_3:.*]] = tosa.slice %arg0, %[[VAL_2]], %[[VAL_1]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<9x21x3xf32> // 
CHECK: %[[VAL_4:.*]] = tosa.reverse %[[VAL_3]] {axis = 1 : i32} : (tensor<9x21x3xf32>) -> tensor<9x21x3xf32> func.func @test_strided_slice_unstrided_shorter(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { @@ -1339,12 +1545,12 @@ func.func @test_strided_slice_dynamic_end(%arg0: tensor<10x?x?xf32>) -> tensor<* %end = arith.constant dense<[7, -1, 6]> : tensor<3xi32> %stride = arith.constant dense<[1, 2, -1]> : tensor<3xi32> - // CHECK-DAG: %[[CONST0:.+]] = tosa.const_shape {value = dense<[7, -1, 2, 1]> : tensor<4xindex>} - // CHECK-DAG: %[[CONST1:.+]] = tosa.const_shape {value = dense<[7, -1]> : tensor<2xindex>} - // CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<0> : tensor<4xindex>} - // CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[7, -1, 1, 1]> : tensor<4xindex>} - // CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[7, -1, 1]> : tensor<3xindex>} - // CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[0, 1, 2]> : tensor<3xindex>} + // CHECK-DAG: %[[CONST0:.+]] = tosa.const_shape {values = dense<[7, -1, 2, 1]> : tensor<4xindex>} + // CHECK-DAG: %[[CONST1:.+]] = tosa.const_shape {values = dense<[7, -1]> : tensor<2xindex>} + // CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} + // CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[7, -1, 1, 1]> : tensor<4xindex>} + // CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[7, -1, 1]> : tensor<3xindex>} + // CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[0, 1, 2]> : tensor<3xindex>} // CHECK: %[[VAL_5:.*]] = tosa.slice %arg0, %[[VAL_4]], %[[VAL_3]] : (tensor<10x?x?xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<7x?x1xf32> // CHECK: %[[VAL_6:.*]] = tosa.reshape %[[VAL_5]], %[[CONST0]] // CHECK: %[[VAL_7:.*]] = tosa.slice %[[VAL_6]], %[[VAL_1]], %[[VAL_2]] : (tensor<7x?x2x1xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<7x?x1x1xf32> @@ -1357,12 +1563,12 @@ func.func @test_strided_slice_dynamic_end(%arg0: tensor<10x?x?xf32>) -> tensor<* // ----- // CHECK-LABEL: test_strided_slice_padding_even -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[4, 4, 64]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<0> : tensor<5xindex>} : () -> !tosa.shape<5> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[4, 1, 4, 1, 64]> : tensor<5xindex>} : () -> !tosa.shape<5> -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[4, 2, 4, 2, 64]> : tensor<5xindex>} : () -> !tosa.shape<5> -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[0, 1, 0, 1, 0, 0]> : tensor<6xindex>} : () -> !tosa.shape<6> -// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[4, 4, 64]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<0> : tensor<5xindex>} : () -> !tosa.shape<5> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[4, 1, 4, 1, 64]> : tensor<5xindex>} : () -> !tosa.shape<5> +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[4, 2, 4, 2, 64]> : tensor<5xindex>} : () -> !tosa.shape<5> +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[0, 1, 0, 1, 0, 0]> : tensor<6xindex>} : () -> !tosa.shape<6> +// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> // CHECK: %[[VAL_7:.*]] = tosa.pad %arg0, 
%[[VAL_5]], %[[VAL_6]] : (tensor<7x7x64xf32>, !tosa.shape<6>, tensor<1xf32>) -> tensor<8x8x64xf32> // CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_7]], %[[VAL_4]] : (tensor<8x8x64xf32>, !tosa.shape<5>) -> tensor<4x2x4x2x64xf32> // CHECK: %[[VAL_9:.*]] = tosa.slice %[[VAL_8]], %[[VAL_2]], %[[VAL_3]] : (tensor<4x2x4x2x64xf32>, !tosa.shape<5>, !tosa.shape<5>) -> tensor<4x1x4x1x64xf32> @@ -1378,12 +1584,12 @@ func.func @test_strided_slice_padding_even(%arg0: tensor<7x7x64xf32>) -> tensor< // ----- // CHECK-LABEL: test_strided_slice_padding_odd -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[5, 5, 32]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<0> : tensor<5xindex>} : () -> !tosa.shape<5> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[5, 1, 5, 1, 32]> : tensor<5xindex>} : () -> !tosa.shape<5> -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[5, 3, 5, 3, 32]> : tensor<5xindex>} : () -> !tosa.shape<5> -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[0, 1, 0, 1, 0, 0]> : tensor<6xindex>} : () -> !tosa.shape<6> -// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[5, 5, 32]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<0> : tensor<5xindex>} : () -> !tosa.shape<5> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[5, 1, 5, 1, 32]> : tensor<5xindex>} : () -> !tosa.shape<5> +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[5, 3, 5, 3, 32]> : tensor<5xindex>} : () -> !tosa.shape<5> +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[0, 1, 0, 1, 0, 0]> : tensor<6xindex>} : () -> !tosa.shape<6> +// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> // CHECK: %[[VAL_7:.*]] = tosa.pad %arg0, %[[VAL_5]], %[[VAL_6]] : (tensor<14x14x32xf32>, !tosa.shape<6>, tensor<1xf32>) -> tensor<15x15x32xf32> // CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_7]], %[[VAL_4]] : (tensor<15x15x32xf32>, !tosa.shape<5>) -> tensor<5x3x5x3x32xf32> // CHECK: %[[VAL_9:.*]] = tosa.slice %[[VAL_8]], %[[VAL_2]], %[[VAL_3]] : (tensor<5x3x5x3x32xf32>, !tosa.shape<5>, !tosa.shape<5>) -> tensor<5x1x5x1x32xf32> @@ -1399,12 +1605,12 @@ func.func @test_strided_slice_padding_odd(%arg0: tensor<14x14x32xf32>) -> tensor // ----- // CHECK-LABEL: test_strided_slice_padding_pad_greater_than_1 -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[5, 5, 32]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<0> : tensor<5xindex>} : () -> !tosa.shape<5> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[5, 1, 5, 1, 32]> : tensor<5xindex>} : () -> !tosa.shape<5> -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[5, 3, 5, 3, 32]> : tensor<5xindex>} : () -> !tosa.shape<5> -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[0, 2, 0, 2, 0, 0]> : tensor<6xindex>} : () -> !tosa.shape<6> -// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[5, 5, 32]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<0> : tensor<5xindex>} : () -> !tosa.shape<5> +// CHECK-DAG: %[[VAL_3:.*]] = 
tosa.const_shape {values = dense<[5, 1, 5, 1, 32]> : tensor<5xindex>} : () -> !tosa.shape<5> +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[5, 3, 5, 3, 32]> : tensor<5xindex>} : () -> !tosa.shape<5> +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[0, 2, 0, 2, 0, 0]> : tensor<6xindex>} : () -> !tosa.shape<6> +// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> // CHECK: %[[VAL_7:.*]] = tosa.pad %arg0, %[[VAL_5]], %[[VAL_6]] : (tensor<13x13x32xf32>, !tosa.shape<6>, tensor<1xf32>) -> tensor<15x15x32xf32> // CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_7]], %[[VAL_4]] : (tensor<15x15x32xf32>, !tosa.shape<5>) -> tensor<5x3x5x3x32xf32> // CHECK: %[[VAL_9:.*]] = tosa.slice %[[VAL_8]], %[[VAL_2]], %[[VAL_3]] : (tensor<5x3x5x3x32xf32>, !tosa.shape<5>, !tosa.shape<5>) -> tensor<5x1x5x1x32xf32> @@ -1420,7 +1626,7 @@ func.func @test_strided_slice_padding_pad_greater_than_1(%arg0: tensor<13x13x32x // ----- // CHECK-LABEL: test_select -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<1> : tensor<3xindex>} +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<1> : tensor<3xindex>} // CHECK: %[[VAR1:.*]] = tosa.reshape %arg2, %[[VAR0]] : (tensor<1xi1>, !tosa.shape<3>) -> tensor<1x1x1xi1> // CHECK: %[[VAR2:.*]] = tosa.select %[[VAR1]], %arg0, %arg1 func.func @test_select(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %arg2: tensor<1xi1>) -> tensor<13x21x3xf32> { @@ -1462,7 +1668,7 @@ func.func @test_concatv2(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, // CHECK-LABEL: test_stack // CHECK-DAG: %[[VAR0:.*]] = tosa.concat %arg0, %arg1, %arg2, %arg3 {axis = 0 : i32} -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[4, 13, 21, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[4, 13, 21, 3]> : tensor<4xindex>} // CHECK: %[[VAR1:.*]] = tosa.reshape %[[VAR0]], %[[VAR10]] func.func @test_stack(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %arg2: tensor<13x21x3xf32>, %arg3: tensor<13x21x3xf32>) -> tensor<4x13x21x3xf32> { %0 = "tfl.pack"(%arg0, %arg1, %arg2, %arg3) {axis = 0 : i32, values_count = 4 : i32} : (tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<4x13x21x3xf32> @@ -1473,7 +1679,7 @@ func.func @test_stack(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>, %a // CHECK-LABEL: test_stack_end // CHECK-DAG: %[[VAR0:.*]] = tosa.concat %arg0, %arg1 {axis = 0 : i32} -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[2, 13, 21, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[2, 13, 21, 3]> : tensor<4xindex>} // CHECK: %[[VAR1:.*]] = tosa.reshape %[[VAR0]], %[[VAR10]] // CHECK: %[[TRANSPOSE:.*]] = tosa.transpose %[[VAR1]] {perms = array} func.func @test_stack_end(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3x2xf32> { @@ -1484,7 +1690,7 @@ func.func @test_stack_end(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32> // ----- // CHECK-LABEL: test_unstack -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[32, 32, 8]> : tensor<3xindex>} +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[32, 32, 8]> : tensor<3xindex>} // CHECK: %[[VAR1:.*]] = tosa.reshape %arg0, %[[VAR0]] func.func @test_unstack(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { %0 = "tfl.unpack"(%arg0) {axis = 0 : i32, num = 1 : i32} : (tensor<1x32x32x8xf32>) -> tensor<*xf32> @@ -1494,8 +1700,8 @@ 
func.func @test_unstack(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_pad -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[1, 1, 2, 2]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[1, 1, 2, 2]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAR1:.*]] = tosa.pad %arg0, %[[VAR0]], %[[PVAL]] func.func @test_pad(%arg0: tensor<2x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[[1, 1], [2, 2]]> : tensor<2x2xi32> @@ -1509,10 +1715,10 @@ func.func @test_pad(%arg0: tensor<2x3xf32>) -> tensor<*xf32> { // CHECK-LABEL: test_pad_v2 // CHECK-SAME: -> tensor<1x257x9x28xf32> func.func @test_pad_v2(%arg0: tensor<1x256x8x25xf32>) -> (tensor<*xf32>) { - // CHECK-DAG: %[[PADDING:.+]] = tosa.const_shape {value = dense<[0, 0, 1, 0, 0, 1, 1, 2]> : tensor<8xindex>} : () -> !tosa.shape<8> + // CHECK-DAG: %[[PADDING:.+]] = tosa.const_shape {values = dense<[0, 0, 1, 0, 0, 1, 1, 2]> : tensor<8xindex>} : () -> !tosa.shape<8> %0 = "tfl.pseudo_const"() {value = dense<[[0, 0], [1, 0], [0, 1], [1, 2]]> : tensor<4x2xi32>} : () -> tensor<4x2xi32> - // CHECK-DAG: %[[VAL:.+]] = "tosa.const"() <{value = dense<-3.40282347E+38> : tensor<1xf32>}> + // CHECK-DAG: %[[VAL:.+]] = "tosa.const"() <{values = dense<-3.40282347E+38> : tensor<1xf32>}> %1 = "tfl.pseudo_const"() {value = dense<-3.40282347E+38> : tensor} : () -> tensor // CHECK-DAG: %[[PAD:.+]] = tosa.pad %arg0, %[[PADDING]], %[[VAL]] : (tensor<1x256x8x25xf32>, !tosa.shape<8>, tensor<1xf32>) -> tensor<1x257x9x28xf32> @@ -1525,9 +1731,9 @@ func.func @test_pad_v2(%arg0: tensor<1x256x8x25xf32>) -> (tensor<*xf32>) { // ----- // CHECK-LABEL: test_pad_v2_quant -// CHECK-DAG: %[[VAL0:.*]] = "tosa.const"() <{value = dense<-128> : tensor<1xi8>}> : () -> tensor<1x!quant.uniform> -// CHECK-DAG: %[[VAL1:.*]] = tosa.const_shape {value = dense<[0, 1, 0, 1, 0, 1, 0, 1]> : tensor<8xindex>} : () -> !tosa.shape<8> -// CHECK: %[[VAL2:.*]] = tosa.pad %arg0, %[[VAL1]], %[[VAL0]] {input_zp = 42 : i32} +// CHECK-DAG: %[[VAL0:.*]] = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> : () -> tensor<1x!quant.uniform> +// CHECK-DAG: %[[VAL1:.*]] = tosa.const_shape {values = dense<[0, 1, 0, 1, 0, 1, 0, 1]> : tensor<8xindex>} : () -> !tosa.shape<8> +// CHECK: %[[VAL2:.*]] = tosa.pad %arg0, %[[VAL1]], %[[VAL0]] // CHECK: return %[[VAL2]] func.func @test_pad_v2_quant(%arg0: tensor<1x7x7x9x!quant.uniform>) -> (tensor<2x8x8x10x!quant.uniform>) { %0 = "tfl.pseudo_const"() <{value = dense<[[0, 1], [0, 1], [0, 1], [0, 1]]> : tensor<4x2xi32>}> : () -> tensor<4x2xi32> @@ -1539,7 +1745,7 @@ func.func @test_pad_v2_quant(%arg0: tensor<1x7x7x9x!quant.uniform : tensor<4xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, 13, 21, 3]> : tensor<4xindex>} // CHECK: %[[VAR0:.*]] = tosa.reshape %arg0, %[[VAR10]] func.func @test_expand_dims(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[1, 13, 21, 3]> : tensor<4xi32> @@ -1550,7 +1756,7 @@ func.func @test_expand_dims(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_expand_dims_minus_1 -// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {value = dense<[13, 21, 3, 1]> : tensor<4xindex>} +// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {values = dense<[13, 21, 3, 1]> : tensor<4xindex>} // CHECK: 
%[[VAR0:.*]] = tosa.reshape %arg0, %[[SHAPE]] func.func @test_expand_dims_minus_1(%arg0: tensor<13x21x3xf32>) -> tensor { %cst = "tfl.pseudo_const"() {value = dense<-1> : tensor} : () -> tensor @@ -1561,7 +1767,7 @@ func.func @test_expand_dims_minus_1(%arg0: tensor<13x21x3xf32>) -> tensor : tensor<4xindex>} +// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {values = dense<[13, 21, 1, 3]> : tensor<4xindex>} // CHECK: %[[VAR0:.*]] = tosa.reshape %arg0, %[[SHAPE]] func.func @test_expand_dims_minus_2(%arg0: tensor<13x21x3xf32>) -> tensor { %cst = "tfl.pseudo_const"() {value = dense<-2> : tensor} : () -> tensor @@ -1572,7 +1778,7 @@ func.func @test_expand_dims_minus_2(%arg0: tensor<13x21x3xf32>) -> tensor : tensor<4xindex>} +// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {values = dense<[1, 13, 21, 3]> : tensor<4xindex>} // CHECK: %[[VAR0:.*]] = tosa.reshape %arg0, %[[SHAPE]] func.func @test_expand_dims_0(%arg0: tensor<13x21x3xf32>) -> tensor { %cst = "tfl.pseudo_const"() {value = dense<0> : tensor} : () -> tensor @@ -1583,7 +1789,7 @@ func.func @test_expand_dims_0(%arg0: tensor<13x21x3xf32>) -> tensor // ----- // CHECK-LABEL: test_expand_dims_2 -// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {value = dense<[13, 21, 1, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {values = dense<[13, 21, 1, 3]> : tensor<4xindex>} // CHECK: %[[VAR0:.*]] = tosa.reshape %arg0, %[[SHAPE]] func.func @test_expand_dims_2(%arg0: tensor<13x21x3xf32>) -> tensor { %cst = "tfl.pseudo_const"() {value = dense<2> : tensor} : () -> tensor @@ -1594,7 +1800,7 @@ func.func @test_expand_dims_2(%arg0: tensor<13x21x3xf32>) -> tensor // ----- // CHECK-LABEL: test_expand_dims_size -// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {value = dense<[13, 21, 3, 1]> : tensor<4xindex>} +// CHECK-DAG: %[[SHAPE:.*]] = tosa.const_shape {values = dense<[13, 21, 3, 1]> : tensor<4xindex>} // CHECK: %[[VAR0:.*]] = tosa.reshape %arg0, %[[SHAPE]] func.func @test_expand_dims_size(%arg0: tensor<13x21x3xf32>) -> tensor { %cst = "tfl.pseudo_const"() {value = dense<3> : tensor} : () -> tensor @@ -1605,7 +1811,7 @@ func.func @test_expand_dims_size(%arg0: tensor<13x21x3xf32>) -> tensor : tensor<3xi32>}> +// CHECK: %[[VAR0:.*]] = "tosa.const"() <{values = dense<[13, 21, 3]> : tensor<3xi32>}> func.func @test_shape() -> tensor<3xi32> { %cst = arith.constant dense<[13, 21, 3]> : tensor<3xi32> func.return %cst : tensor<3xi32> @@ -1614,7 +1820,7 @@ func.func @test_shape() -> tensor<3xi32> { // ----- // CHECK-LABEL: test_rank -// CHECK: %[[VAR0:.*]] = "tosa.const"() <{value = dense<3> : tensor}> +// CHECK: %[[VAR0:.*]] = "tosa.const"() <{values = dense<3> : tensor}> func.func @test_rank() -> tensor { %cst = arith.constant dense<3> : tensor func.return %cst : tensor @@ -1623,8 +1829,8 @@ func.func @test_rank() -> tensor { // ----- // CHECK-LABEL: test_elu -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<1.000000e+00> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x1x1xf32>}> // CHECK-DAG: %[[VAR2:.*]] = tosa.exp %arg0 // CHECK-DAG: %[[VAR4:.*]] = tosa.sub %[[VAR2]], %[[VAR0]] // CHECK-DAG: %[[VAR6:.*]] = tosa.greater_equal %arg0, %[[VAR1]] @@ -1637,7 +1843,7 @@ func.func @test_elu(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_softmax -// 
CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[VAR0:.*]] = tosa.reduce_max %arg0 // CHECK-DAG: %[[VAR1:.*]] = tosa.sub %arg0, %[[VAR0]] // CHECK-DAG: %[[VAR2:.*]] = tosa.exp %[[VAR1]] @@ -1653,8 +1859,8 @@ func.func @test_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // CHECK-LABEL: test_l2normalization func.func @test_l2normalization(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) { - // CHECK-DAG: %[[MIN:.+]] = "tosa.const"() <{value = dense<1.08420217E-19> : tensor<1x1xf32>}> - // CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> + // CHECK-DAG: %[[MIN:.+]] = "tosa.const"() <{values = dense<1.08420217E-19> : tensor<1x1xf32>}> + // CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[SQR:.+]] = tosa.mul %arg0, %arg0, %[[SHIFT]] // CHECK-DAG: %[[SUM:.+]] = tosa.reduce_sum %[[SQR]] {axis = 1 : i32} // CHECK-DAG: %[[MAX:.+]] = tosa.maximum %[[SUM]], %[[MIN]] @@ -1668,7 +1874,7 @@ func.func @test_l2normalization(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) // ----- // CHECK-LABEL: test_log_softmax -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[VAR0:.*]] = tosa.exp %arg0 // CHECK-DAG: %[[VAR1:.*]] = tosa.reduce_sum %[[VAR0]] {axis = 2 : i32} // CHECK-DAG: %[[VAR2:.*]] = tosa.reciprocal %[[VAR1]] @@ -1682,15 +1888,14 @@ func.func @test_log_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // ----- // CHECK-LABEL: test_matmul -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<28xf32>}> -// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<[14, 1, 1, 19]> : tensor<4xindex>} -// CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {value = dense<[28, 1, 1, 19]> : tensor<4xindex>} -// CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {value = dense<[14, 28]> : tensor<2xindex>} -// CHECK-DAG: %[[CONST3:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {values = dense<[14, 1, 1, 19]> : tensor<4xindex>} +// CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {values = dense<[28, 1, 1, 19]> : tensor<4xindex>} +// CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {values = dense<[14, 28]> : tensor<2xindex>} +// CHECK-DAG: %[[CONST3:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAR2:.*]] = tosa.transpose %arg1 {perms = array} // CHECK: %[[VAR3:.*]] = tosa.reshape %arg0, %[[CONST0]] // CHECK: %[[VAR4:.*]] = tosa.reshape %[[VAR2]], %[[CONST1]] -// CHECK: %[[VAR5:.*]] = tosa.conv2d %[[VAR3]], %[[VAR4]], %[[VAR1]], %[[CONST3]], %[[CONST3]] {acc_type = f32, dilation = array, pad = array, stride = array} +// CHECK: %[[VAR5:.*]] = tosa.conv2d %[[VAR3]], %[[VAR4]], %[[CONST3]], %[[CONST3]], %[[CONST3]] {acc_type = f32, dilation = array, pad = array, stride = array} // CHECK: %[[VAR6:.*]] = tosa.reshape %[[VAR5]], %[[CONST2]] func.func @test_matmul(%arg0: tensor<14x19xf32>, %arg1: tensor<19x28xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[1, 0]> : tensor<2xi32> @@ -1702,60 +1907,10 @@ func.func @test_matmul(%arg0: tensor<14x19xf32>, %arg1: tensor<19x28xf32>) -> te // ----- -// CHECK-LABEL: @test_fullyconnected -func.func @test_fullyconnected(%arg0: tensor<14x19xf32>, %arg1: tensor<28x19xf32>, %arg2: tensor<28xf32>) -> 
tensor<14x28xf32> { - // CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<[14, 1, 1, 19]> : tensor<4xindex>} - // CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {value = dense<[28, 1, 1, 19]> : tensor<4xindex>} - // CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {value = dense<[14, 28]> : tensor<2xindex>} - // CHECK-DAG: %[[CONST3:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> - // CHECK: %[[VAL0:.*]] = tosa.reshape %arg0, %[[CONST0]] - // CHECK: %[[VAL1:.*]] = tosa.reshape %arg1, %[[CONST1]] - // CHECK: %[[VAL2:.*]] = tosa.conv2d %[[VAL0]], %[[VAL1]], %arg2, %[[CONST3]], %[[CONST3]] {acc_type = f32, dilation = array, pad = array, stride = array} - // CHECK: %[[VAL3:.*]] = tosa.reshape %[[VAL2]], %[[CONST2]] - // return %[[VAL3]] - %2 = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<14x19xf32>, tensor<28x19xf32>, tensor<28xf32>) -> tensor<14x28xf32> - func.return %2 : tensor<14x28xf32> -} - -// ----- - -// CHECK-LABEL: @test_fullyconnected_in_batch_dim -func.func @test_fullyconnected_in_batch_dim(%arg0: tensor<1x14x19xf32>, %arg1: tensor<28x19xf32>, %arg2: tensor<28xf32>) -> tensor<14x28xf32> { - // CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<[14, 1, 1, 19]> : tensor<4xindex>} - // CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {value = dense<[28, 1, 1, 19]> : tensor<4xindex>} - // CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {value = dense<[14, 28]> : tensor<2xindex>} - // CHECK-DAG: %[[CONST3:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> - // CHECK: %[[VAL0:.*]] = tosa.reshape %arg0, %[[CONST0]] - // CHECK: %[[VAL1:.*]] = tosa.reshape %arg1, %[[CONST1]] - // CHECK: %[[VAL2:.*]] = tosa.conv2d %[[VAL0]], %[[VAL1]], %arg2, %[[CONST3]], %[[CONST3]] {acc_type = f32, dilation = array, pad = array, stride = array} - // CHECK: %[[VAL3:.*]] = tosa.reshape %[[VAL2]], %[[CONST2]] - // return %[[VAL3]] - %0 = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x14x19xf32>, tensor<28x19xf32>, tensor<28xf32>) -> tensor<14x28xf32> - func.return %0 : tensor<14x28xf32> -} - -// ----- - -// CHECK-LABEL: @test_fullyconnected_extra_dim -func.func @test_fullyconnected_extra_dim(%arg0: tensor<1x14x19xf32>, %arg1: tensor<28x19xf32>, %arg2: tensor<28xf32>) -> tensor<1x14x28xf32> { - // CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<[14, 1, 1, 19]> : tensor<4xindex>} - // CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {value = dense<[28, 1, 1, 19]> : tensor<4xindex>} - // CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {value = dense<[1, 14, 28]> : tensor<3xindex>} - // CHECK-DAG: %[[CONST3:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> - // CHECK: %[[VAL0:.*]] = tosa.reshape %arg0, %[[CONST0]] - // CHECK: %[[VAL1:.*]] = tosa.reshape %arg1, %[[CONST1]] - // CHECK: %[[VAL2:.*]] = tosa.conv2d %[[VAL0]], %[[VAL1]], %arg2, %[[CONST3]], %[[CONST3]] {acc_type = f32, dilation = array, pad = array, stride = array} - // CHECK: %[[VAL3:.*]] = tosa.reshape %[[VAL2]], %[[CONST2]] - // return %[[VAL3]] - %0 = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x14x19xf32>, tensor<28x19xf32>, tensor<28xf32>) -> tensor<1x14x28xf32> - func.return %0 : tensor<1x14x28xf32> -} - -// ----- - // CHECK-LABEL: @test_batch_matmul func.func @test_batch_matmul(%arg0: 
tensor<1x16x128xf32>, %arg1: tensor<1x128x32xf32>) -> (tensor<1x16x32xf32> ) { - // CHECK: tosa.matmul %arg0, %arg1 + // CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> + // CHECK: %[[VAR0:.*]] = tosa.matmul %arg0, %arg1, %[[ZP]], %[[ZP]] %0 = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<1x16x128xf32>, tensor<1x128x32xf32>) -> tensor<1x16x32xf32> func.return %0 : tensor<1x16x32xf32> } @@ -1764,12 +1919,13 @@ func.func @test_batch_matmul(%arg0: tensor<1x16x128xf32>, %arg1: tensor<1x128x32 // CHECK-LABEL: @test_batch_matmul2d func.func @test_batch_matmul2d(%arg0: tensor<16x128xf32>, %arg1: tensor<128x32xf32>) -> (tensor<16x32xf32> ) { - // CHECK-DAG: %[[VAR_10:.*]] = tosa.const_shape {value = dense<[1, 16, 128]> : tensor<3xindex>} - // CHECK-DAG: %[[VAR_11:.*]] = tosa.const_shape {value = dense<[1, 128, 32]> : tensor<3xindex>} - // CHECK-DAG: %[[VAR_12:.*]] = tosa.const_shape {value = dense<[16, 32]> : tensor<2xindex>} + // CHECK-DAG: %[[VAR_10:.*]] = tosa.const_shape {values = dense<[1, 16, 128]> : tensor<3xindex>} + // CHECK-DAG: %[[VAR_11:.*]] = tosa.const_shape {values = dense<[1, 128, 32]> : tensor<3xindex>} + // CHECK-DAG: %[[VAR_12:.*]] = tosa.const_shape {values = dense<[16, 32]> : tensor<2xindex>} + // CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[VAL_0:.*]] = tosa.reshape %arg0, %[[VAR_10]] // CHECK: %[[VAL_1:.*]] = tosa.reshape %arg1, %[[VAR_11]] - // CHECK: %[[VAL_2:.*]] = tosa.matmul %[[VAL_0]], %[[VAL_1]] + // CHECK: %[[VAL_2:.*]] = tosa.matmul %[[VAL_0]], %[[VAL_1]], %[[ZP]], %[[ZP]] // CHECK: %[[VAL_3:.*]] = tosa.reshape %[[VAL_2]], %[[VAR_12]] // CHECK: return %[[VAL_3]] %0 = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<16x128xf32>, tensor<128x32xf32>) -> tensor<16x32xf32> @@ -1780,12 +1936,13 @@ func.func @test_batch_matmul2d(%arg0: tensor<16x128xf32>, %arg1: tensor<128x32xf // CHECK-LABEL: @test_batch_matmul_4d func.func @test_batch_matmul_4d(%arg0: tensor<4x5x16x128xf32>, %arg1: tensor<4x5x128x32xf32>) -> (tensor<4x5x16x32xf32> ) { - // CHECK-DAG: %[[C0:.*]] = tosa.const_shape {value = dense<[20, 16, 128]> : tensor<3xindex>} - // CHECK-DAG: %[[C1:.*]] = tosa.const_shape {value = dense<[20, 128, 32]> : tensor<3xindex>} - // CHECK-DAG: %[[C2:.*]] = tosa.const_shape {value = dense<[4, 5, 16, 32]> : tensor<4xindex>} + // CHECK-DAG: %[[C0:.*]] = tosa.const_shape {values = dense<[20, 16, 128]> : tensor<3xindex>} + // CHECK-DAG: %[[C1:.*]] = tosa.const_shape {values = dense<[20, 128, 32]> : tensor<3xindex>} + // CHECK-DAG: %[[C2:.*]] = tosa.const_shape {values = dense<[4, 5, 16, 32]> : tensor<4xindex>} + // CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK: %[[R0:.*]] = tosa.reshape %arg0, %[[C0]] // CHECK: %[[R1:.*]] = tosa.reshape %arg1, %[[C1]] - // CHECK: %[[MM:.*]] = tosa.matmul %[[R0]], %[[R1]] + // CHECK: %[[MM:.*]] = tosa.matmul %[[R0]], %[[R1]], %[[ZP]], %[[ZP]] // CHECK: tosa.reshape %[[MM]], %[[C2]] %0 = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<4x5x16x128xf32>, tensor<4x5x128x32xf32>) -> tensor<4x5x16x32xf32> func.return %0 : tensor<4x5x16x32xf32> @@ -1795,27 +1952,31 @@ func.func @test_batch_matmul_4d(%arg0: tensor<4x5x16x128xf32>, %arg1: tensor<4x5 // CHECK-LABEL: @test_batch_matmul_transpose func.func @test_batch_matmul_transpose(%arg0: tensor<1x16x128xf32>, %arg1: tensor<1x128x32xf32>) -> (tensor<1x32x16xf32> ) { + // CHECK-DAG: 
%[[ZP:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK-DAG: %[[TP0:.+]] = tosa.transpose %arg0 {perms = array} // CHECK-DAG: %[[TP1:.+]] = tosa.transpose %arg1 {perms = array} - // CHECK: tosa.matmul %[[TP1]], %[[TP0]] + // CHECK: tosa.matmul %[[TP1]], %[[TP0]], %[[ZP]], %[[ZP]] %0 = "tfl.batch_matmul"(%arg1, %arg0) {adj_x = true, adj_y = true} : (tensor<1x128x32xf32>, tensor<1x16x128xf32>) -> tensor<1x32x16xf32> func.return %0 : tensor<1x32x16xf32> } // ----- -// CHECK-LABEL: test_batch_matmul_qi8 +// CHECK-LABEL: @test_batch_matmul_qi8 // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x3x4x4x!quant.uniform> // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x3x4x3x!quant.uniform> -// CHECK-DAG: %[[VAR_10:.*]] = tosa.const_shape {value = dense<[3, 4, 4]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.reshape %[[VAL_0]], %[[VAR_10]] : (tensor<1x3x4x4x!quant.uniform>, !tosa.shape<3>) -> tensor<3x4x4x!quant.uniform> -// CHECK-DAG: %[[VAR_11:.*]] = tosa.const_shape {value = dense<[3, 4, 3]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.reshape %[[VAL_1]], %[[VAR_11]] : (tensor<1x3x4x3x!quant.uniform>, !tosa.shape<3>) -> tensor<3x4x3x!quant.uniform> -// CHECK-DAG: %[[VAL_4:.*]] = tosa.matmul %[[VAL_2]], %[[VAL_3]] {a_zp = -128 : i32, b_zp = -128 : i32} : (tensor<3x4x4x!quant.uniform>, tensor<3x4x3x!quant.uniform>) -> tensor<3x4x3xi32> -// CHECK-DAG: %[[VAR_12:.*]] = tosa.const_shape {value = dense<[1, 3, 4, 3]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_5:.*]] = tosa.reshape %[[VAL_4]], %[[VAR_12]] : (tensor<3x4x3xi32>, !tosa.shape<4>) -> tensor<1x3x4x3xi32> -// CHECK-DAG: %[[VAL_6:.*]] = tosa.rescale %[[VAL_5]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = -128 : i32, per_channel = false, scale32 = true, shift = array} : (tensor<1x3x4x3xi32>) -> tensor<1x3x4x3x!quant.uniform> -// CHECK: return %[[VAL_6]] : tensor<1x3x4x3x!quant.uniform> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<40> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<1488699087> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[1, 3, 4, 3]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_7:.*]] = tosa.const_shape {values = dense<[3, 4, 3]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK: %[[VAL_8:.*]] = tosa.const_shape {values = dense<[3, 4, 4]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK: %[[VAL_9:.*]] = tosa.reshape %[[VAL_0]], %[[VAL_8]] : (tensor<1x3x4x4x!quant.uniform>, !tosa.shape<3>) -> tensor<3x4x4x!quant.uniform> +// CHECK: %[[VAL_10:.*]] = tosa.reshape %[[VAL_1]], %[[VAL_7]] : (tensor<1x3x4x3x!quant.uniform>, !tosa.shape<3>) -> tensor<3x4x3x!quant.uniform> +// CHECK: %[[VAL_11:.*]] = tosa.matmul %[[VAL_9]], %[[VAL_10]], %[[VAL_6]], %[[VAL_6]] : (tensor<3x4x4x!quant.uniform>, tensor<3x4x3x!quant.uniform>, tensor<1xi8>, tensor<1xi8>) -> tensor<3x4x3xi32> +// CHECK: %[[VAL_12:.*]] = tosa.reshape %[[VAL_11]], %[[VAL_5]] : (tensor<3x4x3xi32>, !tosa.shape<4>) -> tensor<1x3x4x3xi32> +// CHECK: %[[VAL_13:.*]] = tosa.rescale %[[VAL_12]], %[[VAL_3]], %[[VAL_2]], %[[VAL_4]], %[[VAL_6]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x3x4x3xi32>, tensor<1xi32>, 
tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) -> tensor<1x3x4x3x!quant.uniform> func.func @test_batch_matmul_qi8(%arg0: tensor<1x3x4x4x!quant.uniform>, %arg1: tensor<1x3x4x3x!quant.uniform>) -> tensor<1x3x4x3x!quant.uniform> { %0 = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<1x3x4x4x!quant.uniform>, tensor<1x3x4x3x!quant.uniform>) -> tensor<1x3x4x3x!quant.uniform> return %0 : tensor<1x3x4x3x!quant.uniform> @@ -1826,8 +1987,8 @@ func.func @test_batch_matmul_qi8(%arg0: tensor<1x3x4x4x!quant.uniform // CHECK-SAME: %[[ARG1:.*]]: tensor<14x64x14xf32> -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[1, 1, 14, 64, 14]> : tensor<5xindex>} : () -> !tosa.shape<5> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<-0.000000e+00> : tensor<25x12x14x64x14xf32>}> : () -> tensor<25x12x14x64x14xf32> +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[1, 1, 14, 64, 14]> : tensor<5xindex>} : () -> !tosa.shape<5> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<-0.000000e+00> : tensor<25x12x14x64x14xf32>}> : () -> tensor<25x12x14x64x14xf32> // CHECK-DAG: %[[VAR5:.*]] = tosa.reshape %[[ARG1]], %[[VAR0]] : (tensor<14x64x14xf32>, !tosa.shape<5>) -> tensor<1x1x14x64x14xf32> // CHECK: tosa.add %[[VAR5]], %[[VAR1]] : (tensor<1x1x14x64x14xf32>, tensor<25x12x14x64x14xf32>) -> tensor<25x12x14x64x14xf32> func.func @test_batch_matmul_with_input_broadcast(%arg0: tensor<25x12x14x14x64xf32>, %arg1: tensor<14x64x14xf32>) -> (tensor<25x12x14x14x14xf32> ) { @@ -1838,17 +1999,21 @@ func.func @test_batch_matmul_with_input_broadcast(%arg0: tensor<25x12x14x14x64xf // ----- // CHECK-LABEL: test_batch_matmul_qi16 -// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x3x4x4x!quant.uniform>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x3x4x3x!quant.uniform>) -> tensor<1x3x4x3x!quant.uniform> -// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[3, 4, 4]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.reshape %[[VAL_0]], %[[VAL_10]] : (tensor<1x3x4x4x!quant.uniform>, !tosa.shape<3>) -> tensor<3x4x4x!quant.uniform> -// CHECK-DAG: %[[VAL_11:.*]] = tosa.const_shape {value = dense<[3, 4, 3]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.reshape %[[VAL_1]], %[[VAL_11]] : (tensor<1x3x4x3x!quant.uniform>, !tosa.shape<3>) -> tensor<3x4x3x!quant.uniform> -// CHECK-DAG: %[[VAL_4:.*]] = tosa.matmul %[[VAL_2]], %[[VAL_3]] {a_zp = 0 : i32, b_zp = 0 : i32} : (tensor<3x4x4x!quant.uniform>, tensor<3x4x3x!quant.uniform>) -> tensor<3x4x3xi48> -// CHECK-DAG: %[[VAR_12:.*]] = tosa.const_shape {value = dense<[1, 3, 4, 3]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_5:.*]] = tosa.reshape %[[VAL_4]], %[[VAR_12]] : (tensor<3x4x3xi48>, !tosa.shape<4>) -> tensor<1x3x4x3xi48> -// CHECK-DAG: %[[VAL_6:.*]] = tosa.rescale %[[VAL_5]] {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = false, shift = array} : (tensor<1x3x4x3xi48>) -> tensor<1x3x4x3x!quant.uniform> -// CHECK: return %[[VAL_6]] : tensor<1x3x4x3x!quant.uniform> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x3x4x4x!quant.uniform> +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x3x4x3x!quant.uniform> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<31> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<20139> : tensor<1xi16>}> : () -> tensor<1xi16> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi48>}> : () -> tensor<1xi48> +// CHECK: %[[VAL_5:.*]] = 
tosa.const_shape {values = dense<[1, 3, 4, 3]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi16>}> : () -> tensor<1xi16> +// CHECK: %[[VAL_7:.*]] = tosa.const_shape {values = dense<[3, 4, 3]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK: %[[VAL_8:.*]] = tosa.const_shape {values = dense<[3, 4, 4]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK: %[[VAL_9:.*]] = tosa.reshape %[[VAL_0]], %[[VAL_8]] : (tensor<1x3x4x4x!quant.uniform>, !tosa.shape<3>) +// CHECK: %[[VAL_10:.*]] = tosa.reshape %[[VAL_1]], %[[VAL_7]] : (tensor<1x3x4x3x!quant.uniform>, !tosa.shape<3>) +// CHECK: %[[VAL_11:.*]] = tosa.matmul %[[VAL_9]], %[[VAL_10]], %[[VAL_6]], %[[VAL_6]] : (tensor<3x4x4x!quant.uniform>, tensor<3x4x3x!quant.uniform>, tensor<1xi16>, tensor<1xi16>) +// CHECK: %[[VAL_12:.*]] = tosa.reshape %[[VAL_11]], %[[VAL_5]] : (tensor<3x4x3xi48>, !tosa.shape<4>) -> tensor<1x3x4x3xi48> +// CHECK: %[[VAL_13:.*]] = tosa.rescale %[[VAL_12]], %[[VAL_3]], %[[VAL_2]], %[[VAL_4]], %[[VAL_6]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = false} : (tensor<1x3x4x3xi48>, tensor<1xi16>, tensor<1xi8>, tensor<1xi48>, tensor<1xi16>) +// CHECK: return %[[VAL_13]] : tensor<1x3x4x3x!quant.uniform> func.func @test_batch_matmul_qi16(%arg0: tensor<1x3x4x4x!quant.uniform>, %arg1: tensor<1x3x4x3x!quant.uniform>) -> (tensor<1x3x4x3x!quant.uniform>) { %0 = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<1x3x4x4x!quant.uniform>, tensor<1x3x4x3x!quant.uniform>) -> tensor<1x3x4x3x!quant.uniform> return %0 : tensor<1x3x4x3x!quant.uniform> @@ -1859,8 +2024,8 @@ return %0 : tensor<1x3x4x3x!quant.uniform> // CHECK-LABEL: test_batch_matmul_with_input_broadcast_1 // CHECK-SAME: %[[ARG0:.*]]: tensor<1x256x256x32xf32> // CHECK-SAME: %[[ARG1:.*]]: tensor<1x32x4xf32> -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[1, 1, 32, 4]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<-0.000000e+00> : tensor<1x256x32x4xf32>}> : () -> tensor<1x256x32x4xf32> +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[1, 1, 32, 4]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<-0.000000e+00> : tensor<1x256x32x4xf32>}> : () -> tensor<1x256x32x4xf32> // CHECK-DAG: %[[VAR5:.*]] = tosa.reshape %[[ARG1]], %[[VAR0]] : (tensor<1x32x4xf32>, !tosa.shape<4>) -> tensor<1x1x32x4xf32> // CHECK-DAG: %[[VAR6:.*]] = tosa.add %[[VAR5]], %[[VAR1]] : (tensor<1x1x32x4xf32>, tensor<1x256x32x4xf32>) -> tensor<1x256x32x4xf32> func.func @test_batch_matmul_with_input_broadcast_1(%arg0: tensor<1x256x256x32xf32>, %arg1: tensor<1x32x4xf32>) -> (tensor<1x256x256x4xf32>) { @@ -1873,8 +2038,8 @@ func.func @test_batch_matmul_with_input_broadcast_1(%arg0: tensor<1x256x256x32xf // CHECK-LABEL: test_batch_matmul_with_input_broadcast_qi8 // CHECK-SAME: %[[ARG0:.*]]: tensor<25x12x14x14x64x!quant.uniform> // CHECK-SAME: %[[ARG1:.*]]: tensor<14x64x14x!quant.uniform> -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[1, 1, 14, 64, 14]> : tensor<5xindex>} : () -> !tosa.shape<5> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0> : tensor<25x12x14x64x14xi32>}> : () -> tensor<25x12x14x64x14xi32> +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[1, 1, 14, 64, 14]> : tensor<5xindex>} : () -> !tosa.shape<5> +// CHECK-DAG: %[[VAR1:.*]] = 
"tosa.const"() <{values = dense<0> : tensor<25x12x14x64x14xi32>}> : () -> tensor<25x12x14x64x14xi32> // CHECK-DAG: %[[VAR7:.*]] = tosa.reshape %[[ARG1]], %[[VAR0]] : (tensor<14x64x14x!quant.uniform>, !tosa.shape<5>) -> tensor<1x1x14x64x14x!quant.uniform> // CHECK-DAG: %[[VAR8:.*]] = tosa.cast %[[VAR7]] : (tensor<1x1x14x64x14x!quant.uniform>) -> tensor<1x1x14x64x14xi32> // CHECK-DAG: %[[VAR9:.*]] = tosa.add %[[VAR8]], %[[VAR1]] : (tensor<1x1x14x64x14xi32>, tensor<25x12x14x64x14xi32>) -> tensor<25x12x14x64x14xi32> @@ -1887,7 +2052,7 @@ func.func @test_batch_matmul_with_input_broadcast_qi8(%arg0: tensor<25x12x14x14x // ----- // CHECK-LABEL: test_add_scalar -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<1.000000e+00> : tensor<1x1x1xf32>}> // CHECK: %[[VAR2:.*]] = tosa.add %arg0, %[[VAR0]] func.func @test_add_scalar(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<1.000000e+00> : tensor @@ -1961,10 +2126,10 @@ func.func @test_fused_activation_relun1to1_clamp( // ----- // CHECK-LABEL: test_split -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[0, 14, 0]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[0, 7, 0]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[13, 7, 3]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<0> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[0, 14, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[0, 7, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[13, 7, 3]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<0> : tensor<3xindex>} // CHECK: %[[VAL_5:.*]] = tosa.slice %arg0, %[[VAL_4]], %[[VAL_3]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<13x7x3xf32> // CHECK: %[[VAL_6:.*]] = tosa.slice %arg0, %[[VAL_2]], %[[VAL_3]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<13x7x3xf32> // CHECK: %[[VAL_7:.*]] = tosa.slice %arg0, %[[VAL_1]], %[[VAL_3]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<13x7x3xf32> @@ -1977,12 +2142,12 @@ func.func @test_split(%arg0: tensor<13x21x3xf32>) -> (tensor<13x7x3xf32>, tensor // ----- // CHECK-LABEL: test_split_dynamic -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[0, 2, 0, 0]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[0, 1, 0, 0]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[13, -1, 3]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[13, 1, -1, 3]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {value = dense<[13, 3, -1, 3]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[0, 2, 0, 0]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[0, 1, 0, 0]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[13, -1, 3]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: 
%[[VAL_4:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[13, 1, -1, 3]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {values = dense<[13, 3, -1, 3]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK: %[[VAL_7:.*]] = tosa.reshape %arg0, %[[VAL_6]] : (tensor<13x?x3xf32>, !tosa.shape<4>) -> tensor<13x3x?x3xf32> // CHECK: %[[VAL_8:.*]] = tosa.slice %[[VAL_7]], %[[VAL_4]], %[[VAL_5]] : (tensor<13x3x?x3xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<13x1x?x3xf32> // CHECK: %[[VAL_9:.*]] = tosa.reshape %[[VAL_8]], %[[VAL_3]] : (tensor<13x1x?x3xf32>, !tosa.shape<3>) -> tensor<13x?x3xf32> @@ -2000,10 +2165,10 @@ func.func @test_split_dynamic(%arg0: tensor<13x?x3xf32>) -> (tensor<13x?x3xf32>, // ----- // CHECK-LABEL: test_split_neg -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[0, 14, 0]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[0, 7, 0]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[13, 7, 3]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<0> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[0, 14, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[0, 7, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[13, 7, 3]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<0> : tensor<3xindex>} // CHECK: %[[VAL_5:.*]] = tosa.slice %arg0, %[[VAL_4]], %[[VAL_3]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<13x7x3xf32> // CHECK: %[[VAL_6:.*]] = tosa.slice %arg0, %[[VAL_2]], %[[VAL_3]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<13x7x3xf32> // CHECK: %[[VAL_7:.*]] = tosa.slice %arg0, %[[VAL_1]], %[[VAL_3]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<13x7x3xf32> @@ -2017,10 +2182,10 @@ func.func @test_split_neg(%arg0: tensor<13x21x3xf32>) -> (tensor<13x7x3xf32>, te // ----- // CHECK-LABEL: test_split_axis_0 -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[14, 0, 0]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[7, 0, 0]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[7, 13, 3]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<0> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[14, 0, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[7, 0, 0]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[7, 13, 3]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<0> : tensor<3xindex>} // CHECK: %[[VAL_5:.*]] = tosa.slice %arg0, %[[VAL_4]], %[[VAL_3]] : (tensor<21x13x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<7x13x3xf32> // CHECK: %[[VAL_6:.*]] = tosa.slice %arg0, %[[VAL_2]], %[[VAL_3]] : (tensor<21x13x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<7x13x3xf32> // CHECK: %[[VAL_7:.*]] = tosa.slice %arg0, %[[VAL_1]], %[[VAL_3]] : (tensor<21x13x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<7x13x3xf32> @@ -2033,10 +2198,10 @@ func.func @test_split_axis_0(%arg0: tensor<21x13x3xf32>) -> (tensor<7x13x3xf32>, // ----- // CHECK-LABEL: test_split_v_neg_axis -// CHECK-DAG: %[[VAL_1:.*]] = 
tosa.const_shape {value = dense<[0, 0, 0, 3]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[2, 3, 3, 5]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[2, 3, 3, 3]> : tensor<4xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<0> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[0, 0, 0, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[2, 3, 3, 5]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[2, 3, 3, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} // CHECK: %[[VAL_5:.*]] = tosa.slice %arg0, %[[VAL_4]], %[[VAL_3]] : (tensor<2x3x3x8xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<2x3x3x3xf32> // CHECK: %[[VAL_6:.*]] = tosa.slice %arg0, %[[VAL_1]], %[[VAL_2]] : (tensor<2x3x3x8xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<2x3x3x5xf32> func.func @test_split_v_neg_axis(%arg0: tensor<2x3x3x8xf32>) -> (tensor<2x3x3x3xf32>, tensor<2x3x3x5xf32>) { @@ -2059,13 +2224,13 @@ func.func @test_tile(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_space_to_batch -// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {value = dense<[0, 0, 0, 1, 0, 0]> : tensor<6xindex>} : () -> !tosa.shape<6> -// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAR0:.*]] = tosa.const_shape {values = dense<[0, 0, 0, 1, 0, 0]> : tensor<6xindex>} : () -> !tosa.shape<6> +// CHECK-DAG: %[[PVAL:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK-DAG: %[[VAR2:.*]] = tosa.pad %arg0, %[[VAR0]], %[[PVAL]] -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[13, 11, 2, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[13, 11, 2, 3]> : tensor<4xindex>} // CHECK-DAG: %[[VAR3:.*]] = tosa.reshape %[[VAR2]], %[[VAR10]] // CHECK-DAG: %[[VAR4:.*]] = tosa.transpose %[[VAR3]] {perms = array} -// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {value = dense<[26, 11, 3]> : tensor<3xindex>} +// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {values = dense<[26, 11, 3]> : tensor<3xindex>} // CHECK: %[[VAR5:.*]] = tosa.reshape %[[VAR4]], %[[VAR11]] func.func @test_space_to_batch(%arg0: tensor<13x21x3xf32>) -> tensor<26x11x3xf32> { %cst = arith.constant dense<2> : tensor<1xi32> @@ -2077,10 +2242,10 @@ func.func @test_space_to_batch(%arg0: tensor<13x21x3xf32>) -> tensor<26x11x3xf32 // ----- // CHECK-LABEL: test_space_to_batch_dyn -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[-1, 81, 1, 80]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[-1, 81, 3, 1, 1, 80]> : tensor<6xindex>} : () -> !tosa.shape<6> -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[0, 0, 0, 2, 0, 0, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8> -// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[-1, 81, 1, 80]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[-1, 81, 3, 1, 1, 80]> : tensor<6xindex>} : () -> !tosa.shape<6> +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[0, 0, 0, 2, 0, 0, 0, 0]> : tensor<8xindex>} : () -> !tosa.shape<8> +// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() 
<{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> // CHECK: %[[VAL_6:.*]] = tosa.pad %arg0, %[[VAL_4]], %[[VAL_5]] : (tensor, !tosa.shape<8>, tensor<1xf32>) -> tensor // CHECK: %[[VAL_7:.*]] = tosa.reshape %[[VAL_6]], %[[VAL_3]] : (tensor, !tosa.shape<6>) -> tensor // CHECK: %[[VAL_8:.*]] = tosa.transpose %[[VAL_7]] {perms = array} : (tensor) -> tensor<3x1x?x81x1x80xf32> @@ -2096,10 +2261,10 @@ func.func @test_space_to_batch_dyn(%arg0 : tensor) -> (tensor} -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[2, 2, 2, 32, 32, 1]> : tensor<6xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[2, 2, 2, 32, 32, 1]> : tensor<6xindex>} // CHECK-DAG: %[[VAR3:.*]] = tosa.reshape %[[VAR2]], %[[VAR10]] // CHECK-DAG: %[[VAR4:.*]] = tosa.transpose %[[VAR3]] {perms = array} -// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {value = dense<[2, 64, 64, 1]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {values = dense<[2, 64, 64, 1]> : tensor<4xindex>} // CHECK-DAG: %[[VAR5:.*]] = tosa.reshape %[[VAR4]], %[[VAR11]] // CHECK: return %[[VAR5:.*]] func.func @test_batch_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<2x64x64x1xf32> { @@ -2114,10 +2279,10 @@ func.func @test_batch_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<2x64x64x1 // ----- // CHECK-LABEL: @test_batch_to_space_dyn -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[-1, 235, 1, 80]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[-1, 237, 1, 80]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[3, 1, -1, 79, 1, 80]> : tensor<6xindex>} : () -> !tosa.shape<6> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[-1, 235, 1, 80]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[-1, 237, 1, 80]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[3, 1, -1, 79, 1, 80]> : tensor<6xindex>} : () -> !tosa.shape<6> // CHECK: %[[VAL_6:.*]] = tosa.reshape %arg0, %[[VAL_5]] : (tensor, !tosa.shape<6>) -> tensor<3x1x?x79x1x80xf32> // CHECK: %[[VAL_7:.*]] = tosa.transpose %[[VAL_6]] {perms = array} : (tensor<3x1x?x79x1x80xf32>) -> tensor // CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_7]], %[[VAL_3]] : (tensor, !tosa.shape<4>) -> tensor @@ -2132,10 +2297,10 @@ func.func @test_batch_to_space_dyn(%arg0 : tensor) -> (tensor : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[1, 135, 240, 384]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[1, 136, 240, 384]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[2, 2, 1, 68, 120, 384]> : tensor<6xindex>} : () -> !tosa.shape<6> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[1, 135, 240, 384]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[1, 136, 240, 384]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_5:.*]] = 
tosa.const_shape {values = dense<[2, 2, 1, 68, 120, 384]> : tensor<6xindex>} : () -> !tosa.shape<6> // CHECK: %[[VAL_6:.*]] = tosa.reshape %arg0, %[[VAL_5]] : (tensor<4x68x120x384xf32>, !tosa.shape<6>) -> tensor<2x2x1x68x120x384xf32> // CHECK: %[[VAL_7:.*]] = tosa.transpose %[[VAL_6]] {perms = array} : (tensor<2x2x1x68x120x384xf32>) -> tensor<1x68x2x120x2x384xf32> // CHECK: %[[VAL_8:.*]] = tosa.reshape %[[VAL_7]], %[[VAL_3]] : (tensor<1x68x2x120x2x384xf32>, !tosa.shape<4>) -> tensor<1x136x240x384xf32> @@ -2150,10 +2315,10 @@ func.func @test_batch_to_space_shape_infer(%arg0 : tensor<4x68x120x384xf32>) -> // ----- // CHECK-LABEL: test_space_to_depth -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[1, 16, 2, 16, 2, 8]> : tensor<6xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, 16, 2, 16, 2, 8]> : tensor<6xindex>} // CHECK-DAG: %[[VAR1:.*]] = tosa.reshape %arg0, %[[VAR10]] // CHECK-DAG: %[[VAR2:.*]] = tosa.transpose %[[VAR1]] {perms = array} -// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {value = dense<[1, 16, 16, 32]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {values = dense<[1, 16, 16, 32]> : tensor<4xindex>} // CHECK: %[[VAR3:.*]] = tosa.reshape %[[VAR2]], %[[VAR11]] func.func @test_space_to_depth(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x16x16x32xf32> { %0 = "tfl.space_to_depth"(%arg0) {block_size = 2 : i32} : (tensor<1x32x32x8xf32>) -> tensor<1x16x16x32xf32> @@ -2163,10 +2328,10 @@ func.func @test_space_to_depth(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x16x16x3 // ----- // CHECK-LABEL: test_depth_to_space -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[1, 32, 32, 2, 2, 2]> : tensor<6xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, 32, 32, 2, 2, 2]> : tensor<6xindex>} // CHECK-DAG: %[[VAR1:.*]] = tosa.reshape %arg0, %[[VAR10]] // CHECK-DAG: %[[VAR2:.*]] = tosa.transpose %[[VAR1]] {perms = array} -// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {value = dense<[1, 64, 64, 2]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {values = dense<[1, 64, 64, 2]> : tensor<4xindex>} // CHECK: %[[VAR3:.*]] = tosa.reshape %[[VAR2]], %[[VAR11]] func.func @test_depth_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x64x64x2xf32> { %0 = "tfl.depth_to_space"(%arg0) {block_size = 2 : i32} : (tensor<1x32x32x8xf32>) -> tensor<1x64x64x2xf32> @@ -2176,10 +2341,10 @@ func.func @test_depth_to_space(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x64x64x2 // ----- // CHECK-LABEL: @test_bucketize -// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<[2, 5]> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {value = dense<[2, 5, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {values = dense<[2, 5]> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {values = dense<[2, 5, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{value = dense<{{\[\[\[}}0.000000e+00, 3.000000e+00, 8.000000e+00, 1.100000e+01]]]> : tensor<1x1x4xf32>}> +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<{{\[\[\[}}0.000000e+00, 3.000000e+00, 8.000000e+00, 1.100000e+01]]]> : tensor<1x1x4xf32>}> // CHECK: %[[VAL_1:.*]] = tosa.reshape %arg0, %[[CONST2]] // CHECK: %[[VAL_2:.*]] = tosa.greater_equal %[[VAL_1]], %[[VAL_0]] // CHECK: %[[VAL_3:.*]] = tosa.cast %[[VAL_2]] : (tensor<2x5x4xi1>) -> tensor<2x5x4xi32> @@ -2193,10 +2358,10 @@ 
func.func @test_bucketize(%arg0: tensor<2x5xf32>) -> tensor<2x5xi32> { // ----- // CHECK-LABEL: @test_bucketize -// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<[2, 5]> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {value = dense<[2, 5, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {values = dense<[2, 5]> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {values = dense<[2, 5, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{value = dense<{{\[\[\[}}0.000000e+00, 3.000000e+00, 8.000000e+00, 1.100000e+01]]]> : tensor<1x1x4xf32>}> +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<{{\[\[\[}}0.000000e+00, 3.000000e+00, 8.000000e+00, 1.100000e+01]]]> : tensor<1x1x4xf32>}> // CHECK: %[[VAL_1:.*]] = tosa.reshape %arg0, %[[CONST2]] // CHECK: %[[VAL_2:.*]] = tosa.greater_equal %[[VAL_1]], %[[VAL_0]] // CHECK: %[[VAL_3:.*]] = tosa.cast %[[VAL_2]] : (tensor<2x5x4xi1>) -> tensor<2x5x4xi32> @@ -2211,11 +2376,11 @@ func.func @test_bucketize(%arg0: tensor<2x5xf32>) -> tensor<2x5xi32> { // CHECK-LABEL: @test_one_hot // CHECK-SAME: %[[ARG0:.*]]: tensor<4x4xi32>, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor -// CHECK-DAG: %[[CST0:.*]] = tosa.const_shape {value = dense<1> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[CST1:.*]] = tosa.const_shape {value = dense<[16, 1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[CST2:.*]] = tosa.const_shape {value = dense<[16, 2, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[CST3:.*]] = tosa.const_shape {value = dense<[16, 1]> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[CST4:.*]] = tosa.const_shape {value = dense<[4, 4, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[CST0:.*]] = tosa.const_shape {values = dense<1> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[CST1:.*]] = tosa.const_shape {values = dense<[16, 1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[CST2:.*]] = tosa.const_shape {values = dense<[16, 2, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[CST3:.*]] = tosa.const_shape {values = dense<[16, 1]> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[CST4:.*]] = tosa.const_shape {values = dense<[4, 4, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> // CHECK-DAG: %[[RESHAPE:.*]] = tosa.reshape %[[ARG1]], %[[CST0]] // CHECK-DAG: %[[TILE:.*]] = tosa.tile %[[RESHAPE]], %[[CST1]] // CHECK-DAG: %[[RESHAPE_0:.*]] = tosa.reshape %[[ARG2]], %[[CST0]] @@ -2233,9 +2398,9 @@ func.func @test_one_hot(%arg0: tensor<4x4xi32>, %arg1: tensor, %arg2: tenso // ----- // CHECK-LABEL: test_fakequant_with_min_max_args -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<6.10360876E-5> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{value = dense<16383.75> : tensor<1x1x1xf32>} -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<6.10360876E-5> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{values = dense<16383.75> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[VAR3:.*]] = tosa.mul %arg0, %[[VAR2]], %[[SHIFT]] // CHECK-DAG: %[[VAR5:.*]] = tosa.cast %[[VAR3]] // CHECK-DAG: %[[VAR6:.*]] = tosa.cast %[[VAR5]] @@ -2260,7 +2425,7 @@ func.func 
@test_dequantize_float(%arg0: tensor<10xf16>) -> tensor<*xf32> { // CHECK-LABEL: @test_dequantize_quant_uniform func.func @test_dequantize_quant_uniform(%arg0: tensor<4x!quant.uniform>) -> tensor<*xf32> { - // CHECK-DAG: %[[VAL0:.+]] = "tosa.const"() <{value = dense<-1.000000e+00> : tensor<1xf32>}> + // CHECK-DAG: %[[VAL0:.+]] = "tosa.const"() <{values = dense<-1.000000e+00> : tensor<1xf32>}> // CHECK-DAG: %[[VAL1:.+]] = tosa.cast %arg0 // CHECK-DAG: %[[VAL2:.+]] = tosa.sub %[[VAL1]], %[[VAL0]] %0 = "tfl.dequantize"(%arg0) : (tensor<4x!quant.uniform>) -> tensor<*xf32> @@ -2270,9 +2435,9 @@ func.func @test_dequantize_quant_uniform(%arg0: tensor<4x!quant.uniform>) -> tensor<*xf32> { - // CHECK-DAG: %[[VAL0:.+]] = "tosa.const"() <{value = dense<{{\[}}[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]]> : tensor<1x4xf32>}> - // CHECK-DAG: %[[VAL1:.+]] = "tosa.const"() <{value = dense<{{\[}}[5.000000e+00, 6.000000e+00, 7.000000e+00, 8.000000e+00]]> : tensor<1x4xf32>}> - // CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> + // CHECK-DAG: %[[VAL0:.+]] = "tosa.const"() <{values = dense<{{\[}}[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]]> : tensor<1x4xf32>}> + // CHECK-DAG: %[[VAL1:.+]] = "tosa.const"() <{values = dense<{{\[}}[5.000000e+00, 6.000000e+00, 7.000000e+00, 8.000000e+00]]> : tensor<1x4xf32>}> + // CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[VAL2:.+]] = tosa.cast %arg0 : (tensor<1x4x!quant.uniform>) -> tensor<1x4xf32> // CHECK-DAG: %[[VAL3:.+]] = tosa.sub %[[VAL2]], %[[VAL1]] : (tensor<1x4xf32>, tensor<1x4xf32>) -> tensor<1x4xf32> // CHECK: %[[VAL4:.+]] = tosa.mul %[[VAL3]], %[[VAL0]], %[[SHIFT]] : (tensor<1x4xf32>, tensor<1x4xf32>, tensor<1xi8>) -> tensor<1x4xf32> @@ -2292,11 +2457,23 @@ func.func @test_quantfork.stats(%arg0: tensor<2x1xf32>) -> (tensor<2x1xf32>) { // ----- // CHECK-LABEL: test_add_qi8 -// CHECK-DAG: %[[VAL_0:.*]] = tosa.rescale %arg0 {double_round = true, input_zp = -1 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAL_1:.*]] = tosa.rescale %[[VAL_0]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.rescale %arg1 {double_round = true, input_zp = -1 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.add %[[VAL_1]], %[[VAL_2]] -// CHECK: %[[VAL_4:.*]] = tosa.rescale %[[VAL_3]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = -1 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x1x!quant.uniform> +// CHECK-SAME: %[[VAL_1:.*]]: tensor<13x21x3x!quant.uniform> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<50> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<1075580483> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<11> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<32> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<2147311776> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_7:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_8:.*]] = "tosa.const"() 
<{values = dense<10> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_9:.*]] = "tosa.const"() <{values = dense<-1> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_10:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_11:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_7]], %[[VAL_8]], %[[VAL_9]], %[[VAL_10]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<13x21x1x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<13x21x1xi32> +// CHECK: %[[VAL_12:.*]] = tosa.rescale %[[VAL_11]], %[[VAL_6]], %[[VAL_5]], %[[VAL_10]], %[[VAL_10]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<13x21x1xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi32>) -> tensor<13x21x1xi32> +// CHECK: %[[VAL_13:.*]] = tosa.rescale %[[VAL_1]], %[[VAL_7]], %[[VAL_4]], %[[VAL_9]], %[[VAL_10]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<13x21x3x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_14:.*]] = tosa.add %[[VAL_12]], %[[VAL_13]] : (tensor<13x21x1xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_15:.*]] = tosa.rescale %[[VAL_14]], %[[VAL_3]], %[[VAL_2]], %[[VAL_10]], %[[VAL_9]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<13x21x3xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) -> tensor<13x21x3x!quant.uniform> +// CHECK: return %[[VAL_15]] : tensor<13x21x3x!quant.uniform> func.func @test_add_qi8(%arg0: tensor<13x21x1x!quant.uniform>, %arg1: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { %0 = tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x1x!quant.uniform>, tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.return %0 : tensor<13x21x3x!quant.uniform> @@ -2305,11 +2482,23 @@ func.func @test_add_qi8(%arg0: tensor<13x21x1x!quant.uniform, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAL_1:.*]] = tosa.rescale %[[VAL_0]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.rescale %arg1 {double_round = true, input_zp = -1 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.sub %[[VAL_1]], %[[VAL_2]] -// CHECK: %[[VAL_4:.*]] = tosa.rescale %[[VAL_3]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x21x3x!quant.uniform> +// CHECK-SAME: %[[VAL_1:.*]]: tensor<13x21x3x!quant.uniform> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<50> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<1076408862> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<11> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<32> : tensor<1xi8>}> : () -> 
tensor<1xi8> +// CHECK: %[[VAL_7:.*]] = "tosa.const"() <{values = dense<2147427038> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_8:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_9:.*]] = "tosa.const"() <{values = dense<10> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_10:.*]] = "tosa.const"() <{values = dense<-1> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_11:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_12:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_8]], %[[VAL_9]], %[[VAL_10]], %[[VAL_11]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x21x3x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<1x21x3xi32> +// CHECK: %[[VAL_13:.*]] = tosa.rescale %[[VAL_12]], %[[VAL_7]], %[[VAL_6]], %[[VAL_11]], %[[VAL_11]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x21x3xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x21x3xi32> +// CHECK: %[[VAL_14:.*]] = tosa.rescale %[[VAL_1]], %[[VAL_8]], %[[VAL_5]], %[[VAL_10]], %[[VAL_11]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<13x21x3x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_15:.*]] = tosa.sub %[[VAL_13]], %[[VAL_14]] : (tensor<1x21x3xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_16:.*]] = tosa.rescale %[[VAL_15]], %[[VAL_4]], %[[VAL_3]], %[[VAL_11]], %[[VAL_2]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<13x21x3xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) -> tensor<13x21x3x!quant.uniform> func.func @test_sub_qi8(%arg0: tensor<1x21x3x!quant.uniform>, %arg1: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { %0 = tfl.sub(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x21x3x!quant.uniform>, tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.return %0 : tensor<13x21x3x!quant.uniform> @@ -2318,11 +2507,15 @@ func.func @test_sub_qi8(%arg0: tensor<1x21x3x!quant.uniform : tensor<1xi8>}> -// CHECK-DAG: %[[VAR0:.*]] = tosa.rescale %arg0 -// CHECK-DAG: %[[VAR1:.*]] = tosa.rescale %arg1 -// CHECK-DAG: %[[VAR2:.*]] = tosa.mul %[[VAR0]], %[[VAR1]], %[[SHIFT]] -// CHECK: %[[VAR3:.*]] = tosa.rescale %[[VAR2]] +// CHECK-DAG: %[[shift35:.*]] = "tosa.const"() <{values = dense<35> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK-DAG: %[[mult1075664768:.*]] = "tosa.const"() <{values = dense<1075664768> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %[[const0:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK-DAG: %[[mult1073741824:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %[[shift30:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK-DAG: %[[VAR0:.*]] = tosa.rescale %arg0, %[[mult1073741824]], %[[shift30]] +// CHECK-DAG: %[[VAR1:.*]] = tosa.rescale %arg1, %[[mult1073741824]], %[[shift30]] +// CHECK: %[[VAR2:.*]] = tosa.mul %[[VAR0]], %[[VAR1]], %[[const0]] +// CHECK: %[[VAR3:.*]] = tosa.rescale %[[VAR2]], %[[mult1075664768]], 
%[[shift35]] func.func @test_mul_qi8(%arg0: tensor<13x21x3x!quant.uniform>, %arg1: tensor<13x21x3x!quant.uniform>) -> tensor<*x!quant.uniform> { %0 = "tfl.mul"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x3x!quant.uniform>, tensor<13x21x3x!quant.uniform>) -> tensor<*x!quant.uniform> func.return %0 : tensor<*x!quant.uniform> @@ -2331,7 +2524,8 @@ func.func @test_mul_qi8(%arg0: tensor<13x21x3x!quant.uniform, output_zp = 0 : i32, pad = array, stride = array} +// CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAR0:.*]] = tosa.avg_pool2d %arg0, %[[ZP]], %[[ZP]] {acc_type = i32, kernel = array, pad = array, stride = array} // CHECK-SAME: -> tensor<1x32x32x8x!quant.uniform> func.func @test_avg_pool2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform>) -> tensor<*x!quant.uniform> { %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8x!quant.uniform>) -> tensor<*x!quant.uniform> @@ -2341,7 +2535,8 @@ func.func @test_avg_pool2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform, pad = array, stride = array} +// CHECK-DAG: %[[ZP:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi16>}> : () -> tensor<1xi16> +// CHECK: %[[VAR0:.*]] = tosa.avg_pool2d %arg0, %[[ZP]], %[[ZP]] {acc_type = i32, kernel = array, pad = array, stride = array} // CHECK-SAME: -> tensor<1x32x32x8xi16> func.func @test_avg_pool2d_i16(%arg0: tensor<1x32x32x8xi16>) -> tensor<*xi16> { %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8xi16>) -> tensor<*xi16> @@ -2360,27 +2555,33 @@ func.func @test_max_pool2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform : tensor<1x1x1xi32>}> -// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{value = dense<4> : tensor<1x1x1xi32>}> -// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{value = dense<536870912> : tensor<1x1x1xi32>}> -// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() <{value = dense<1515870810> : tensor<1x1x1xi32>}> -// CHECK-DAG: %[[VAR5:.*]] = "tosa.const"() <{value = dense<-1010580540> : tensor<1x1x1xi32>}> -// CHECK-DAG: %[[VAR6:.*]] = "tosa.const"() <{value = dense<1> : tensor<1x1x1xi32>}> -// CHECK-DAG: %[[VAR7:.*]] = "tosa.const"() <{value = dense<12> : tensor<1x1x1xi32>}> -// CHECK-DAG: %[[VAR8:.*]] = "tosa.const"() <{value = dense<7> : tensor<1x1x1xi32>}> -// CHECK-DAG: %[[VAR9:.*]] = "tosa.const"() <{value = dense<9> : tensor<1x1x1xi32>}> -// CHECK-DAG: %[[VAR10:.*]] = "tosa.const"() <{value = dense<17> : tensor<1x1x1xi32>}> -// CHECK-DAG: %[[VAR11:.*]] = "tosa.const"() <{value = dense<"0x5{{.*}}"> : tensor<513xi16>}> -// CHECK-DAG: %[[VAR12:.*]] = "tosa.const"() <{value = dense<"0xE{{.*}}"> : tensor<513xi16>}> -// CHECK-DAG: %[[VAR13:.*]] = "tosa.const"() <{value = dense<"0x4{{.*}}"> : tensor<513xi16>}> -// CHECK-DAG: %[[VAR14:.*]] = "tosa.const"() <{value = dense<"0x0{{.*}}"> : tensor<513xi16>}> -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> -// CHECK-DAG: %[[SHIFT_30:.*]] = "tosa.const"() <{value = dense<30> : tensor<1xi8>}> -// CHECK-DAG: %[[SHIFT_31:.*]] = "tosa.const"() <{value = dense<31> : tensor<1xi -// CHECK-DAG: %[[VAR15:.*]] = tosa.rescale %arg0 {double_round = false, input_zp = -1 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-DAG: 
%[[VAR1:.*]] = "tosa.const"() <{values = dense<35> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{values = dense<4> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{values = dense<536870912> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() <{values = dense<1515870810> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR5:.*]] = "tosa.const"() <{values = dense<-1010580540> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR6:.*]] = "tosa.const"() <{values = dense<1> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR7:.*]] = "tosa.const"() <{values = dense<12> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR8:.*]] = "tosa.const"() <{values = dense<7> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR9:.*]] = "tosa.const"() <{values = dense<9> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR10:.*]] = "tosa.const"() <{values = dense<17> : tensor<1x1x1xi32>}> +// CHECK-DAG: %[[VAR11:.*]] = "tosa.const"() <{values = dense<"0x5{{.*}}"> : tensor<513xi16>}> +// CHECK-DAG: %[[VAR12:.*]] = "tosa.const"() <{values = dense<"0xE{{.*}}"> : tensor<513xi16>}> +// CHECK-DAG: %[[VAR13:.*]] = "tosa.const"() <{values = dense<"0x4{{.*}}"> : tensor<513xi16>}> +// CHECK-DAG: %[[VAR14:.*]] = "tosa.const"() <{values = dense<"0x0{{.*}}"> : tensor<513xi16>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[SHIFT_31:.*]] = "tosa.const"() <{values = dense<31> : tensor<1xi8>}> +// CHECK-DAG: %[[mult1073741824:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-DAG: %[[shift30:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK-DAG: %[[shift23:.*]] = "tosa.const"() <{values = dense<23> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK-DAG: %[[input_zp1:.*]] = "tosa.const"() <{values = dense<-1> : tensor<1xi8>}> +// CHECK-DAG: %[[zp0i32:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> +// CHECK-DAG: %[[output_zp128:.*]] = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> +// CHECK-DAG: %[[VAL27:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi16>}> +// CHECK-DAG: %[[VAR15:.*]] = tosa.rescale %arg0, %[[mult1073741824]], %[[shift30]], %[[input_zp1]], %[[zp0i32]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} // CHECK-DAG: %[[VAR16:.*]] = tosa.reduce_max %[[VAR15]] {axis = 2 : i32} // CHECK-DAG: %[[VAR17:.*]] = tosa.sub %[[VAR15]], %[[VAR16]] -// CHECK-DAG: %[[VAR18:.*]] = tosa.rescale %[[VAR17]] {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-DAG: %[[VAR18:.*]] = tosa.rescale %[[VAR17]], %[[mult1073741824]], %[[shift23]], %[[zp0i32]], %[[VAL27]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} // CHECK-DAG: %[[VAR19:.*]] = tosa.table %[[VAR18]], %[[VAR14]] // CHECK-DAG: %[[VAR20:.*]] = tosa.table %[[VAR18]], %[[VAR13]] // CHECK-DAG: %[[VAR21:.*]] = tosa.table %[[VAR18]], %[[VAR12]] @@ -2414,10 +2615,10 @@ func.func @test_max_pool2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform, output_zp = -128 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK: %[[VAR55:.*]] = tosa.rescale %[[VAR54]], %[[mult1073741824]], %[[shift30]], %[[zp0i32]], %[[output_zp128]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} func.func 
@test_softmax_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { %0 = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.return %0 : tensor<13x21x3x!quant.uniform> @@ -2425,40 +2626,46 @@ func.func @test_softmax_qi8(%arg0: tensor<13x21x3x!quant.uniform : tensor<1x1xi32>}> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<7> : tensor<1x1xi32>}> -// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{value = dense<32768> : tensor<1x1xi32>}> -// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{value = dense<14> : tensor<1x1xi32>}> -// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() <{value = dense<1073741824> : tensor<1x1xi32>}> -// CHECK-DAG: %[[VAR5:.*]] = "tosa.const"() <{value = dense<1> : tensor<1x1xi32>}> -// CHECK-DAG: %[[VAR6:.*]] = "tosa.const"() <{value = dense<32767> : tensor<1x1xi32>}> -// CHECK-DAG: %[[VAR7:.*]] = "tosa.const"() <{value = dense<"0xF{{.*}}> -// CHECK-DAG: %[[VAR8:.*]] = "tosa.const"() <{value = dense<"0x0{{.*}}> : tensor<513xi16>}> -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> -// CHECK-DAG: %[[VAR9:.*]] = tosa.rescale %arg0 {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAR10:.*]] = tosa.reduce_max %[[VAR9]] {axis = 1 : i32} -// CHECK-DAG: %[[VAR11:.*]] = tosa.sub %[[VAR9]], %[[VAR10]] -// CHECK-DAG: %[[VAR12:.*]] = tosa.rescale %[[VAR11]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK-DAG: %[[VAR13:.*]] = tosa.add %[[VAR12]], %[[VAR6]] -// CHECK-DAG: %[[VAR14:.*]] = tosa.cast %[[VAR13]] -// CHECK-DAG: %[[VAR15:.*]] = tosa.table %[[VAR14]], %[[VAR8]] -// CHECK-DAG: %[[VAR16:.*]] = tosa.arithmetic_right_shift %[[VAR15]], %[[VAR1]] {round = true} -// CHECK-DAG: %[[VAR17:.*]] = tosa.reduce_sum %[[VAR16]] {axis = 1 : i32} -// CHECK-DAG: %[[VAR18:.*]] = tosa.clz %[[VAR17]] -// CHECK-DAG: %[[VAR19:.*]] = tosa.sub %[[VAR18]], %[[VAR5]] -// CHECK-DAG: %[[VAR20:.*]] = tosa.logical_left_shift %[[VAR17]], %[[VAR19]] -// CHECK-DAG: %[[VAR21:.*]] = tosa.sub %[[VAR20]], %[[VAR4]] -// CHECK-DAG: %[[VAR22:.*]] = tosa.arithmetic_right_shift %[[VAR21]], %[[VAR3]] {round = true} -// CHECK-DAG: %[[VAR23:.*]] = tosa.sub %[[VAR22]], %[[VAR2]] -// CHECK-DAG: %[[VAR24:.*]] = tosa.cast %[[VAR23]] -// CHECK-DAG: %[[VAR25:.*]] = tosa.table %[[VAR24]], %[[VAR7]] -// CHECK-DAG: %[[VAR26:.*]] = tosa.arithmetic_right_shift %[[VAR25]], %[[VAR1]] {round = true} -// CHECK-DAG: %[[VAR27:.*]] = tosa.mul %[[VAR26]], %[[VAR16]], %[[SHIFT]] -// CHECK-DAG: %[[VAR28:.*]] = tosa.sub %[[VAR0]], %[[VAR18]] -// CHECK-DAG: %[[VAR29:.*]] = tosa.arithmetic_right_shift %[[VAR27]], %[[VAR28]] {round = true} -// CHECK: %[[VAR30:.*]] = tosa.rescale %[[VAR29]] {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-SAME: %[[VAL_0:.*]]: tensor<14x19x!quant.uniform> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<31> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<"0xF{{.*}}"> : tensor<513xi16>}> +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<32768> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<14> : tensor<1x1xi32>}> +// 
CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAL_7:.*]] = "tosa.const"() <{values = dense<1> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAL_8:.*]] = "tosa.const"() <{values = dense<7> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAL_9:.*]] = "tosa.const"() <{values = dense<32767> : tensor<1x1xi32>}> +// CHECK-DAG: %[[VAL_10:.*]] = "tosa.const"() <{values = dense<32> : tensor<1xi8>}> +// CHECK-DAG: %[[VAL_11:.*]] = "tosa.const"() <{values = dense<1717965619> : tensor<1xi32>}> +// CHECK-DAG: %[[VAL_12:.*]] = "tosa.const"() <{values = dense<"0x0{{.*}}"> : tensor<513xi16>}> +// CHECK-DAG: %[[VAL_13:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> +// CHECK-DAG: %[[VAL_14:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> +// CHECK-DAG: %[[VAL_15:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi16>}> +// CHECK-DAG: %[[VAL_16:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> +// CHECK-DAG: %[[VAL_17:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[VAL_16]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<14x19x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi16>, tensor<1xi32>) -> tensor<14x19xi32> +// CHECK-DAG: %[[VAL_18:.*]] = tosa.reduce_max %[[VAL_17]] +// CHECK-DAG: %[[VAL_19:.*]] = tosa.sub %[[VAL_17]], %[[VAL_18]] +// CHECK-DAG: %[[VAL_20:.*]] = tosa.rescale %[[VAL_19]], %[[VAL_11]], %[[VAL_10]], %[[VAL_16]], %[[VAL_16]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<14x19xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi32>) -> tensor<14x19xi32> +// CHECK-DAG: %[[VAL_21:.*]] = tosa.add %[[VAL_20]], %[[VAL_9]] +// CHECK-DAG: %[[VAL_22:.*]] = tosa.cast %[[VAL_21]] +// CHECK-DAG: %[[VAL_23:.*]] = tosa.table %[[VAL_22]], %[[VAL_12]] +// CHECK-DAG: %[[VAL_24:.*]] = tosa.arithmetic_right_shift %[[VAL_23]], %[[VAL_8]] +// CHECK-DAG: %[[VAL_25:.*]] = tosa.reduce_sum %[[VAL_24]] +// CHECK-DAG: %[[VAL_26:.*]] = tosa.clz %[[VAL_25]] +// CHECK-DAG: %[[VAL_27:.*]] = tosa.sub %[[VAL_26]], %[[VAL_7]] +// CHECK-DAG: %[[VAL_28:.*]] = tosa.logical_left_shift %[[VAL_25]], %[[VAL_27]] +// CHECK-DAG: %[[VAL_29:.*]] = tosa.sub %[[VAL_28]], %[[VAL_6]] +// CHECK-DAG: %[[VAL_30:.*]] = tosa.arithmetic_right_shift %[[VAL_29]], %[[VAL_5]] +// CHECK-DAG: %[[VAL_31:.*]] = tosa.sub %[[VAL_30]], %[[VAL_4]] +// CHECK-DAG: %[[VAL_32:.*]] = tosa.cast %[[VAL_31]] +// CHECK-DAG: %[[VAL_33:.*]] = tosa.table %[[VAL_32]], %[[VAL_3]] +// CHECK-DAG: %[[VAL_34:.*]] = tosa.arithmetic_right_shift %[[VAL_33]], %[[VAL_8]] +// CHECK-DAG: %[[VAL_35:.*]] = tosa.mul %[[VAL_34]], %[[VAL_24]], %[[VAL_2]] +// CHECK-DAG: %[[VAL_36:.*]] = tosa.sub %[[VAL_1]], %[[VAL_26]] +// CHECK-DAG: %[[VAL_37:.*]] = tosa.arithmetic_right_shift %[[VAL_35]], %[[VAL_36]] +// CHECK-DAG: %[[VAL_38:.*]] = tosa.rescale %[[VAL_37]], %[[VAL_13]], %[[VAL_14]], %[[VAL_16]], %[[VAL_15]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<14x19xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi16>) func.func @test_softmax_qi16(%arg0: tensor<14x19x!quant.uniform>) -> tensor<14x19x!quant.uniform> { %0 = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<14x19x!quant.uniform>) -> tensor<14x19x!quant.uniform> func.return %0 : tensor<14x19x!quant.uniform> @@ -2467,7 +2674,7 @@ 
func.func @test_softmax_qi16(%arg0: tensor<14x19x!quant.uniform : tensor<256xi8>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<{{.*}}> : tensor<256xi8>}> // CHECK: %[[VAR1:.*]] = tosa.table %arg0, %[[VAR0]] func.func @test_sigmoid_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<*x!quant.uniform> { %0 = "tfl.logistic"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<*x!quant.uniform> @@ -2477,7 +2684,7 @@ func.func @test_sigmoid_qi8(%arg0: tensor<13x21x3x!quant.uniform : tensor<256xi8>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<{{.*}}> : tensor<256xi8>}> // CHECK: %[[VAR1:.*]] = tosa.table %arg0, %[[VAR0]] func.func @test_tanh_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<*x!quant.uniform> { %0 = "tfl.tanh"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<*x!quant.uniform> @@ -2487,8 +2694,13 @@ func.func @test_tanh_qi8(%arg0: tensor<13x21x3x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<2147471153> : tensor<1xi32>}> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<-1> : tensor<1xi8>}> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> +// CHECK: %[[VAL_5:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<13x21x3x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) +// CHECK: %[[VAL_6:.*]] = tosa.clamp %[[VAL_5]] {max_val = 127 : i8, min_val = -128 : i8} : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.func @test_relu_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { %0 = "tfl.relu"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.return %0 : tensor<13x21x3x!quant.uniform> @@ -2497,8 +2709,12 @@ func.func @test_relu_qi8(%arg0: tensor<13x21x3x!quant.uniform> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<2147449478> : tensor<1xi32>}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<-1> : tensor<1xi8>}> +// CHECK-DAG: %[[VAL_4:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_3]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<13x21x3x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) +// CHECK: %[[VAL_5:.*]] = tosa.clamp %[[VAL_4]] {max_val = 126 : i8, min_val = -128 : i8} func.func @test_relu0To1_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { %0 = "tfl.relu_n1_to_1"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.return %0 : tensor<13x21x3x!quant.uniform> @@ -2506,9 +2722,14 @@ func.func @test_relu0To1_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<2147467328> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<-1> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_5:.*]] = tosa.rescale 
%[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<13x21x3x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<13x21x3x!quant.uniform> +// CHECK: %[[VAL_6:.*]] = tosa.clamp %[[VAL_5]] {max_val = 127 : i8, min_val = -128 : i8} : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.func @test_relu6_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { %0 = "tfl.relu6"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> func.return %0 : tensor<13x21x3x!quant.uniform> @@ -2533,11 +2754,17 @@ func.func @test_relu6_qu8(%arg0: tensor<13x21x3x!quant.uniform> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<14x19x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<2037371008> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<31> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<-1> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_7:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_3]], %[[VAL_4]], %[[VAL_5]], %[[VAL_6]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<14x19x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<14x19xi32> +// CHECK: %[[VAL_8:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_2]], %[[VAL_1]], %[[VAL_5]], %[[VAL_6]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<14x19x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<14x19xi32> +// CHECK: %[[VAL_9:.*]] = tosa.maximum %[[VAL_8]], %[[VAL_7]] +// CHECK: %[[VAL_10:.*]] = tosa.rescale %[[VAL_9]], %[[VAL_2]], %[[VAL_1]], %[[VAL_6]], %[[VAL_5]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<14x19xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) func.func @test_leaky_relu_qi8(%arg0: tensor<14x19x!quant.uniform>) -> tensor<*x!quant.uniform> { %0 = "tfl.leaky_relu"(%arg0) {alpha = 0.948724806 : f32} : (tensor<14x19x!quant.uniform>) -> tensor<*x!quant.uniform> func.return %0 : tensor<*x!quant.uniform> @@ -2546,24 +2773,34 @@ func.func @test_leaky_relu_qi8(%arg0: tensor<14x19x!quant.uniform> -func.func @test_leaky_relu_qi16(%arg0: tensor<14x19x!quant.uniform>) -> tensor<*x!quant.uniform> { - %0 = "tfl.leaky_relu"(%arg0) {alpha = 1.048724806 : f32} : (tensor<14x19x!quant.uniform>) -> tensor<*x!quant.uniform> - func.return %0 : tensor<*x!quant.uniform> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<14x19x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<1126059648> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = 
dense<0> : tensor<1xi16>}> : () -> tensor<1xi16> +// CHECK: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_6:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]], %[[VAL_5]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<14x19x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi16>, tensor<1xi32>) -> tensor<14x19xi32> +// CHECK: %[[VAL_7:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_1]], %[[VAL_3]], %[[VAL_4]], %[[VAL_5]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<14x19x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi16>, tensor<1xi32>) -> tensor<14x19xi32> +// CHECK: %[[VAL_8:.*]] = tosa.minimum %[[VAL_7]], %[[VAL_6]] +// CHECK: %[[VAL_9:.*]] = tosa.rescale %[[VAL_8]], %[[VAL_1]], %[[VAL_3]], %[[VAL_5]], %[[VAL_4]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<14x19xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi16>) +func.func @test_leaky_relu_qi16(%arg0: tensor<14x19x!quant.uniform>) -> tensor<*x!quant.uniform> { + %0 = "tfl.leaky_relu"(%arg0) {alpha = 1.048724806 : f32} : (tensor<14x19x!quant.uniform>) -> tensor<*x!quant.uniform> + func.return %0 : tensor<*x!quant.uniform> } // ----- // CHECK-LABEL: test_resize_bilinear_qi8 -// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {value = dense<[16, 2, 16, 2]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {value = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {value = dense<14> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[VAR1:.*]] = tosa.resize %arg0, %[[SCALE]], %[[OFFSET]], %[[BORDER]] {mode = "BILINEAR"} -// CHECK: %[[VAR2:.*]] = tosa.rescale %[[VAR1]] {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x80x80x2x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<38> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[16, 2, 16, 2]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_6:.*]] = tosa.const_shape {values = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK: %[[VAL_7:.*]] = tosa.const_shape {values = dense<14> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK: %[[VAL_8:.*]] = tosa.resize %[[VAL_0]], %[[VAL_5]], %[[VAL_6]], %[[VAL_7]] {mode = "BILINEAR"} : (tensor<1x80x80x2x!quant.uniform>, !tosa.shape<4>, !tosa.shape<2>, !tosa.shape<2>) -> tensor<1x640x640x2xi32> +// CHECK: %[[VAL_9:.*]] = tosa.rescale %[[VAL_8]], %[[VAL_4]], %[[VAL_3]], %[[VAL_2]], %[[VAL_1]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<1x640x640x2xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) func.func @test_resize_bilinear_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> 
tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> %1 = "tfl.resize_bilinear"(%arg0, %0) {align_corners = false, half_pixel_centers = false} : (tensor<1x80x80x2x!quant.uniform>, tensor<2xi32>) -> tensor<1x640x640x2x!quant.uniform> @@ -2573,9 +2810,9 @@ func.func @test_resize_bilinear_qi8(%arg0: tensor<1x80x80x2x!quant.uniform : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {value = dense<-7> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {value = dense<7> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {values = dense<[16, 2, 16, 2]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {values = dense<-7> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {values = dense<7> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAR1:.*]] = tosa.resize %arg0, %[[SCALE]], %[[OFFSET]], %[[BORDER]] {mode = "BILINEAR"} func.func @test_resize_bilinear_half_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> @@ -2586,8 +2823,8 @@ func.func @test_resize_bilinear_half_qi8(%arg0: tensor<1x80x80x2x!quant.uniform< // ----- // CHECK-LABEL: test_resize_bilinear_align_qi8 -// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {value = dense<[1278, 158, 1278, 158]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {values = dense<[1278, 158, 1278, 158]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {values = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAR1:.*]] = tosa.resize %arg0, %[[SCALE]], %[[CONST0]], %[[CONST0]] {mode = "BILINEAR"} func.func @test_resize_bilinear_align_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> @@ -2598,8 +2835,8 @@ func.func @test_resize_bilinear_align_qi8(%arg0: tensor<1x80x80x2x!quant.uniform // ----- // CHECK-LABEL: test_resize_bilinear_align_half_qi8 -// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {value = dense<[1278, 158, 1278, 158]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[KM560:.*]] = tosa.const_shape {value = dense<-560> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {values = dense<[1278, 158, 1278, 158]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[KM560:.*]] = tosa.const_shape {values = dense<-560> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAR1:.*]] = tosa.resize %arg0, %[[SCALE]], %[[KM560]], %[[KM560]] {mode = "BILINEAR"} func.func @test_resize_bilinear_align_half_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> @@ -2610,9 +2847,9 @@ func.func @test_resize_bilinear_align_half_qi8(%arg0: tensor<1x80x80x2x!quant.un // ----- // CHECK-LABEL: test_resize_nearest_qi8 -// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {value = dense<[16, 2, 16, 2]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape 
{value = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {value = dense<14> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {values = dense<[16, 2, 16, 2]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {values = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {values = dense<14> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAR1:.*]] = tosa.resize %arg0, %[[SCALE]], %[[OFFSET]], %[[BORDER]] {mode = "NEAREST_NEIGHBOR"} func.func @test_resize_nearest_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> @@ -2624,9 +2861,9 @@ func.func @test_resize_nearest_qi8(%arg0: tensor<1x80x80x2x!quant.uniform : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {value = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {value = dense<15> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {values = dense<[16, 2, 16, 2]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {values = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {values = dense<15> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAR1:.*]] = tosa.resize %arg0, %[[SCALE]], %[[OFFSET]], %[[BORDER]] {mode = "NEAREST_NEIGHBOR"} func.func @test_resize_nearest_half_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> @@ -2637,8 +2874,8 @@ func.func @test_resize_nearest_half_qi8(%arg0: tensor<1x80x80x2x!quant.uniform : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[K639:.*]] = tosa.const_shape {value = dense<639> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {values = dense<[1278, 158, 1278, 158]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[K639:.*]] = tosa.const_shape {values = dense<639> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAR1:.*]] = tosa.resize %arg0, %[[SCALE]], %[[K639]], %[[K639]] {mode = "NEAREST_NEIGHBOR"} func.func @test_resize_nearest_align_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> @@ -2649,8 +2886,8 @@ func.func @test_resize_nearest_align_qi8(%arg0: tensor<1x80x80x2x!quant.uniform< // ----- // CHECK-LABEL: test_resize_nearest_align_half_qi8 -// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {value = dense<[1278, 158, 1278, 158]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[K718:.*]] = tosa.const_shape {value = dense<718> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {values = dense<[1278, 158, 1278, 158]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[K718:.*]] = tosa.const_shape {values = dense<718> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAR1:.*]] = tosa.resize %arg0, %[[SCALE]], %[[K718]], %[[K718]] {mode = "NEAREST_NEIGHBOR"} func.func @test_resize_nearest_align_half_qi8(%arg0: tensor<1x80x80x2x!quant.uniform>) -> tensor<1x640x640x2x!quant.uniform> { %0 = 
"tfl.pseudo_const"() {value = dense<640> : tensor<2xi32>} : () -> tensor<2xi32> @@ -2661,9 +2898,9 @@ func.func @test_resize_nearest_align_half_qi8(%arg0: tensor<1x80x80x2x!quant.uni // ----- // CHECK-LABEL: test_resize_bilinear_f32_scalar_input -// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {value = dense<[2, 1, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {value = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {value = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {values = dense<[2, 1, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {values = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {values = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAR1:.*]] = tosa.resize %arg0, %[[SCALE]], %[[OFFSET]], %[[BORDER]] {mode = "BILINEAR"} func.func @test_resize_bilinear_f32_scalar_input(%arg0: tensor<3x1x1x7xf32>) -> tensor<3x2x2x7xf32> { %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> @@ -2674,11 +2911,16 @@ func.func @test_resize_bilinear_f32_scalar_input(%arg0: tensor<3x1x1x7xf32>) -> // ----- // CHECK-LABEL: test_resize_bilinear_half_qi8_scalar_input -// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {value = dense<[2, 1, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {value = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {value = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[VAL_1:.*]] = tosa.resize %arg0, %[[SCALE]], %[[OFFSET]], %[[BORDER]] {mode = "BILINEAR"} -// CHECK: %[[VAL_2:.*]] = tosa.rescale %[[VAL_1]] {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3x1x1x7x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<32> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[2, 1, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_6:.*]] = tosa.const_shape {values = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK: %[[VAL_7:.*]] = tosa.const_shape {values = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK: %[[VAL_8:.*]] = tosa.resize %[[VAL_0]], %[[VAL_5]], %[[VAL_6]], %[[VAL_7]] {mode = "BILINEAR"} : (tensor<3x1x1x7x!quant.uniform>, !tosa.shape<4>, !tosa.shape<2>, !tosa.shape<2>) -> tensor<3x2x2x7xi32> +// CHECK: %[[VAL_9:.*]] = tosa.rescale %[[VAL_8]], %[[VAL_4]], %[[VAL_3]], %[[VAL_2]], %[[VAL_1]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<3x2x2x7xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) func.func @test_resize_bilinear_half_qi8_scalar_input(%arg0: tensor<3x1x1x7x!quant.uniform>) -> tensor<3x2x2x7x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> %1 = 
"tfl.resize_bilinear"(%arg0, %0) {align_corners = false, half_pixel_centers = true} : (tensor<3x1x1x7x!quant.uniform>, tensor<2xi32>) -> tensor<3x2x2x7x!quant.uniform> @@ -2688,11 +2930,16 @@ func.func @test_resize_bilinear_half_qi8_scalar_input(%arg0: tensor<3x1x1x7x!qua // ----- // CHECK-LABEL: test_resize_bilinear_align_qi8_scalar_input -// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {value = dense<[2, 1, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {value = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {value = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[VAL_1:.*]] = tosa.resize %arg0, %[[SCALE]], %[[OFFSET]], %[[BORDER]] {mode = "BILINEAR"} -// CHECK: %[[VAL_2:.*]] = tosa.rescale %[[VAL_1]] {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3x1x1x7x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<32> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[2, 1, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_6:.*]] = tosa.const_shape {values = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK: %[[VAL_7:.*]] = tosa.const_shape {values = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK: %[[VAL_8:.*]] = tosa.resize %[[VAL_0]], %[[VAL_5]], %[[VAL_6]], %[[VAL_7]] {mode = "BILINEAR"} : (tensor<3x1x1x7x!quant.uniform>, !tosa.shape<4>, !tosa.shape<2>, !tosa.shape<2>) -> tensor<3x2x2x7xi32> +// CHECK: %[[VAL_9:.*]] = tosa.rescale %[[VAL_8]], %[[VAL_4]], %[[VAL_3]], %[[VAL_2]], %[[VAL_1]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<3x2x2x7xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) func.func @test_resize_bilinear_align_qi8_scalar_input(%arg0: tensor<3x1x1x7x!quant.uniform>) -> tensor<3x2x2x7x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> %1 = "tfl.resize_bilinear"(%arg0, %0) {align_corners = true, half_pixel_centers = false} : (tensor<3x1x1x7x!quant.uniform>, tensor<2xi32>) -> tensor<3x2x2x7x!quant.uniform> @@ -2702,9 +2949,9 @@ func.func @test_resize_bilinear_align_qi8_scalar_input(%arg0: tensor<3x1x1x7x!qu // ----- // CHECK-LABEL: test_resize_nearest_f32_scalar_input -// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {value = dense<[2, 1, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {value = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {value = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {values = dense<[2, 1, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {values = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {values = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAL_1:.*]] = 
tosa.resize %arg0, %[[SCALE]], %[[OFFSET]], %[[BORDER]] {mode = "NEAREST_NEIGHBOR"} func.func @test_resize_nearest_f32_scalar_input(%arg0: tensor<3x1x1x7xf32>) -> tensor<3x2x2x7xf32> { %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> @@ -2715,9 +2962,9 @@ func.func @test_resize_nearest_f32_scalar_input(%arg0: tensor<3x1x1x7xf32>) -> t // ----- // CHECK-LABEL: test_resize_nearest_half_qi8_scalar_input -// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {value = dense<[2, 1, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {value = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {value = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {values = dense<[2, 1, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {values = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {values = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAL_1:.*]] = tosa.resize %arg0, %[[SCALE]], %[[OFFSET]], %[[BORDER]] {mode = "NEAREST_NEIGHBOR"} func.func @test_resize_nearest_half_qi8_scalar_input(%arg0: tensor<3x1x1x7x!quant.uniform>) -> tensor<3x2x2x7x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> @@ -2728,9 +2975,9 @@ func.func @test_resize_nearest_half_qi8_scalar_input(%arg0: tensor<3x1x1x7x!quan // ----- // CHECK-LABEL: test_resize_nearest_align_qi8_scalar_input -// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {value = dense<[2, 1, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {value = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {value = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[SCALE:.*]] = tosa.const_shape {values = dense<[2, 1, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[OFFSET:.*]] = tosa.const_shape {values = dense<0> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[BORDER:.*]] = tosa.const_shape {values = dense<1> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAL_1:.*]] = tosa.resize %arg0, %[[SCALE]], %[[OFFSET]], %[[BORDER]] {mode = "NEAREST_NEIGHBOR"} func.func @test_resize_nearest_align_qi8_scalar_input(%arg0: tensor<3x1x1x7x!quant.uniform>) -> tensor<3x2x2x7x!quant.uniform> { %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> @@ -2740,34 +2987,64 @@ func.func @test_resize_nearest_align_qi8_scalar_input(%arg0: tensor<3x1x1x7x!qua // ----- -// CHECK-LABEL: test_fullyconnected_qi8 -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<0> : tensor<28xi32>}> -// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<[14, 1, 1, 19]> : tensor<4xindex>} -// CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {value = dense<[28, 1, 1, 19]> : tensor<4xindex>} -// CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {value = dense<[14, 28]> : tensor<2xindex>} -// CHECK-DAG: %[[CONST3:.*]] = "tosa.const"() <{value = dense<-1> : tensor<1xi8>}> -// CHECK-DAG: %[[VAR2:.*]] = tosa.transpose %arg1 {perms = array} -// CHECK-DAG: %[[VAR3:.*]] = tosa.reshape %arg0, %[[CONST0]] -// CHECK-DAG: %[[VAR4:.*]] = tosa.reshape %[[VAR2]], %[[CONST1]] -// CHECK-DAG: %[[VAR5:.*]] = tosa.conv2d %[[VAR3]], %[[VAR4]], %[[VAR1]], %[[CONST3]], %[[CONST3]] {acc_type = i32, dilation = 
array, pad = array, stride = array} -// CHECK: %[[VAR6:.*]] = tosa.rescale %[[VAR5]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 3 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAR9:.*]] = tosa.reshape %[[VAR6]], %[[CONST2]] -func.func @test_fullyconnected_qi8(%arg0: tensor<14x19x!quant.uniform>, %arg1: tensor<19x28x!quant.uniform>) -> tensor<14x28x!quant.uniform> { - %0 = "tfl.pseudo_const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> - %1 = "tfl.transpose"(%arg1, %0) : (tensor<19x28x!quant.uniform>, tensor<2xi32>) -> tensor<28x19x!quant.uniform> - %cst = "tfl.no_value"() {value = unit} : () -> none - %2 = "tfl.fully_connected"(%arg0, %1, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<14x19x!quant.uniform>, tensor<28x19x!quant.uniform>, none) -> tensor<14x28x!quant.uniform> - func.return %2 : tensor<14x28x!quant.uniform> +// CHECK-LABEL: test_fullyconnected_qi16 +// CHECK: %[[BIAS:.+]] = "tosa.const"() <{values = dense<123> : tensor<3xi48>}> : () -> tensor<3xi48> +// CHECK: tosa.conv2d {{.+}}, %[[BIAS]], %{{.+}} {acc_type = i48, {{.+}}} : {{.+}} -> tensor<1x1x1x3xi48> +func.func @test_fullyconnected_qi16(%input: tensor<1x7x!quant.uniform>, %filter: tensor<3x7x!quant.uniform>) -> tensor<1x3x!quant.uniform> { + %bias = "tfl.pseudo_qconst"() {qtype = tensor<3x!quant.uniform>, value = dense<123> : tensor<3xi32>} : () -> tensor<3x!quant.uniform> + %0 = "tfl.fully_connected"(%input, %filter, %bias) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x7x!quant.uniform>, tensor<3x7x!quant.uniform>, tensor<3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %0 : tensor<1x3x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: @test_fullyconnected_dynamic_output +func.func @test_fullyconnected_dynamic_output(%arg0: tensor<1x2048xf32>, %arg1: tensor<1000x2048xf32>, %arg2: tensor<1000xf32>) -> tensor { + // CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {values = dense<[1, 1, 1, 2048]> : tensor<4xindex>} + // CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {values = dense<[1000, 1, 1, 2048]> : tensor<4xindex>} + // CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {values = dense<[1, 1000]> : tensor<2xindex>} + // CHECK-DAG: %[[CONST3:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> + // CHECK: %[[VAL0:.*]] = tosa.reshape %arg0, %[[CONST0]] + // CHECK: %[[VAL1:.*]] = tosa.reshape %arg1, %[[CONST1]] + // CHECK: %[[VAL2:.*]] = tosa.conv2d %[[VAL0]], %[[VAL1]], %arg2, %[[CONST3]], %[[CONST3]] {acc_type = f32, dilation = array, pad = array, stride = array} + // CHECK: %[[VAL3:.*]] = tosa.reshape %[[VAL2]], %[[CONST2]] + // return %[[VAL3]] + %0 = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x2048xf32>, tensor<1000x2048xf32>, tensor<1000xf32>) -> tensor + func.return %0 : tensor +} + +// ----- + +// CHECK-LABEL: @test_fullyconnected_keep_dims +func.func @test_fullyconnected_keep_dims(%arg0: tensor<1x64x64x768x!quant.uniform>, %arg1: tensor<3072x768x!quant.uniform:f32, 0.003333511995151639>>, %arg2: tensor<3072x!quant.uniform>) -> tensor<1x64x64x3072x!quant.uniform> { + // CHECK-DAG: %[[CONST_SHAPE0:.*]] = tosa.const_shape {values = dense<[1, 64, 64, 3072]> : tensor<4xindex>} + // CHECK-DAG: %[[CONST0:.*]] = "tosa.const"() <{values = dense<38> : tensor<1xi8>}> + // CHECK-DAG: %[[CONST1:.*]] = "tosa.const"() <{values = 
dense<1241512252> : tensor<1xi32>}> + // CHECK-DAG: %[[CONST2:.*]] = "tosa.const"() <{values = dense<45> : tensor<1xi8>}> + // CHECK-DAG: %[[CONST3:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> + // CHECK-DAG: %[[CONST4:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> + // CHECK-DAG: %[[CONST5:.*]] = "tosa.const"() <{values = dense<5> : tensor<1xi8>}> + // CHECK-DAG: %[[CONST_SHAPE1:.*]] = tosa.const_shape {values = dense<[3072, 1, 1, 768]> : tensor<4xindex>} + // CHECK-DAG: %[[CONST_SHAPE2:.*]] = tosa.const_shape {values = dense<[4096, 1, 1, 768]> : tensor<4xindex>} + // CHECK: %[[RESHAPE_IN:.*]] = tosa.reshape %arg0, %[[CONST_SHAPE2]] : (tensor<1x64x64x768x!quant.uniform>, !tosa.shape<4>) + // CHECK: %[[RESHAPE_FILT:.*]] = tosa.reshape %arg1, %[[CONST_SHAPE1]] : (tensor<3072x768x!quant.uniform:f32, 0.003333511995151639>>, !tosa.shape<4>) + // CHECK: %[[CONV:.*]] = tosa.conv2d %[[RESHAPE_IN]], %[[RESHAPE_FILT]], %arg2, %[[CONST5]], %[[CONST4]] {acc_type = i32, dilation = array, pad = array, stride = array} : (tensor<4096x1x1x768x!quant.uniform>, tensor<3072x1x1x768x!quant.uniform:f32, 0.003333511995151639>>, tensor<3072x!quant.uniform>, tensor<1xi8>, tensor<1xi8>) + // CHECK: %[[RESCALE:.*]] = tosa.rescale %[[CONV]], %[[CONST1]], %[[CONST0]], %[[CONST3]], %[[CONST2]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<4096x1x1x3072xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) + // CHECK: %[[RESHAPE_OUT:.*]] = tosa.reshape %[[RESCALE]], %[[CONST_SHAPE0]] : (tensor<4096x1x1x3072x!quant.uniform>, !tosa.shape<4>) -> tensor<1x64x64x3072x!quant.uniform> + // CHECK: return %[[RESHAPE_OUT]] + %0 = "tfl.fully_connected"(%arg0, %arg1, %arg2) {asymmetric_quantize_inputs = false, fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<1x64x64x768x!quant.uniform>, tensor<3072x768x!quant.uniform:f32, 0.003333511995151639>>, tensor<3072x!quant.uniform>) -> tensor<1x64x64x3072x!quant.uniform> + func.return %0 : tensor<1x64x64x3072x!quant.uniform> } // ----- + // CHECK-LABEL: test_gather -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[1, 13, 63]> : tensor<3xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, 13, 63]> : tensor<3xindex>} // CHECK-DAG: %[[VAR4:.*]] = tosa.reshape %arg0, %[[VAR10]] -// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {value = dense<[1, 49]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {values = dense<[1, 49]> : tensor<2xindex>} // CHECK-DAG: %[[VAR5:.*]] = tosa.reshape %arg1, %[[VAR11]] // CHECK-DAG: %[[VAR6:.*]] = tosa.gather %[[VAR4]], %[[VAR5]] -// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {value = dense<[7, 7, 21, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {values = dense<[7, 7, 21, 3]> : tensor<4xindex>} // CHECK-DAG: %[[VAR7:.*]] = tosa.reshape %[[VAR6]], %[[VAR12]] // CHECK: return %[[VAR7]] func.func @test_gather(%arg0: tensor<13x21x3xf32>, %arg1: tensor<7x7xi32>) -> tensor<*xf32> { @@ -2777,12 +3054,12 @@ func.func @test_gather(%arg0: tensor<13x21x3xf32>, %arg1: tensor<7x7xi32>) -> te // ----- // CHECK-LABEL: test_gather_dyn -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[1, -1, 63]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, -1, 63]> : tensor<3xindex>} : () -> !tosa.shape<3> // CHECK-DAG: %[[VAR4:.*]] = tosa.reshape %arg0, %[[VAR10]] -// 
CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {value = dense<[1, 49]> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {values = dense<[1, 49]> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAR5:.*]] = tosa.reshape %arg1, %[[VAR11]] // CHECK-DAG: %[[VAR6:.*]] = tosa.gather %[[VAR4]], %[[VAR5]] -// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {value = dense<[7, 7, 21, 3]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {values = dense<[7, 7, 21, 3]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK-DAG: %[[VAR7:.*]] = tosa.reshape %[[VAR6]], %[[VAR12]] // CHECK: return %[[VAR7]] func.func @test_gather_dyn(%arg0: tensor, %arg1 : tensor<7x7xi32>) -> tensor<*xf32> { @@ -2793,12 +3070,12 @@ func.func @test_gather_dyn(%arg0: tensor, %arg1 : tensor<7x7xi32>) - // ----- // CHECK-LABEL: test_gather_channel_dyn -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[1, 13, -1]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, 13, -1]> : tensor<3xindex>} : () -> !tosa.shape<3> // CHECK-DAG: %[[VAR4:.*]] = tosa.reshape %arg0, %[[VAR10]] -// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {value = dense<[1, 49]> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {values = dense<[1, 49]> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAR5:.*]] = tosa.reshape %arg1, %[[VAR11]] // CHECK-DAG: %[[VAR6:.*]] = tosa.gather %[[VAR4]], %[[VAR5]] -// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {value = dense<[7, 7, 21, -1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {values = dense<[7, 7, 21, -1]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK-DAG: %[[VAR7:.*]] = tosa.reshape %[[VAR6]], %[[VAR12]] // CHECK: return %[[VAR7]] func.func @test_gather_channel_dyn(%arg0: tensor<13x21x?xf32>, %arg1: tensor<7x7xi32>) -> tensor<*xf32> { @@ -2808,12 +3085,12 @@ func.func @test_gather_channel_dyn(%arg0: tensor<13x21x?xf32>, %arg1: tensor<7x7 // ----- // CHECK-LABEL: test_gather_indices_dyn -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[1, 13, 63]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, 13, 63]> : tensor<3xindex>} : () -> !tosa.shape<3> // CHECK-DAG: %[[VAR4:.*]] = tosa.reshape %arg0, %[[VAR10]] -// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {value = dense<[1, -1]> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {values = dense<[1, -1]> : tensor<2xindex>} : () -> !tosa.shape<2> // CHECK-DAG: %[[VAR5:.*]] = tosa.reshape %arg1, %[[VAR11]] // CHECK-DAG: %[[VAR6:.*]] = tosa.gather %[[VAR4]], %[[VAR5]] -// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {value = dense<[-1, 7, 21, 3]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {values = dense<[-1, 7, 21, 3]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK-DAG: %[[VAR7:.*]] = tosa.reshape %[[VAR6]], %[[VAR12]] // CHECK: return %[[VAR7]] func.func @test_gather_indices_dyn(%arg0: tensor<13x21x3xf32>, %arg1: tensor) -> tensor<*xf32> { @@ -2823,9 +3100,9 @@ func.func @test_gather_indices_dyn(%arg0: tensor<13x21x3xf32>, %arg1: tensor : tensor<3xindex>} -// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {value = dense<[1, 3, 4, 4]> : tensor<4xindex>} -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<{{\[\[}}0, 3, 1]]> : tensor<1x3xi32> +// CHECK-DAG: 
%[[VAR10:.*]] = tosa.const_shape {values = dense<[1, 4, 16]> : tensor<3xindex>} +// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {values = dense<[1, 3, 4, 4]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<{{\[\[}}0, 3, 1]]> : tensor<1x3xi32> // CHECK-DAG: %[[VAR1:.*]] = tosa.reshape %arg0, %[[VAR10]] // CHECK-DAG: %[[VAR2:.*]] = tosa.gather %[[VAR1]], %[[VAR0]] // CHECK-DAG: %[[VAR3:.*]] = tosa.reshape %[[VAR2]], %[[VAR11]] @@ -2839,10 +3116,10 @@ func.func @test_gather_batch(%arg0: tensor<1x4x4x4xi32>) -> tensor<1x3x4x4xi32> // ----- // CHECK-LABEL: test_gather_batch_dyn -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[-1, 4, 16]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[-1, 4, 16]> : tensor<3xindex>} : () -> !tosa.shape<3> // CHECK-DAG: %[[VAR1:.*]] = tosa.reshape %arg0, %[[VAR10]] // CHECK-DAG: %[[VAR2:.*]] = tosa.gather %[[VAR1]], %arg1 -// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {value = dense<[-1, 3, 4, 4]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {values = dense<[-1, 3, 4, 4]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK-DAG: %[[VAR3:.*]] = tosa.reshape %[[VAR2]], %[[VAR11]] // CHECK: return %[[VAR3]] func.func @test_gather_batch_dyn(%arg0: tensor, %arg1: tensor) -> tensor { @@ -2852,11 +3129,11 @@ func.func @test_gather_batch_dyn(%arg0: tensor, %arg1: tensor : tensor<3xindex>} -// CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {value = dense<[42, 2]> : tensor<2xindex>} -// CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {value = dense<[1, 42]> : tensor<2xindex>} -// CHECK-DAG: %[[CONST3:.*]] = tosa.const_shape {value = dense<[6, 7, 3]> : tensor<3xindex>} -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {values = dense<[1, 273, 3]> : tensor<3xindex>} +// CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {values = dense<[42, 2]> : tensor<2xindex>} +// CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {values = dense<[1, 42]> : tensor<2xindex>} +// CHECK-DAG: %[[CONST3:.*]] = tosa.const_shape {values = dense<[6, 7, 3]> : tensor<3xindex>} +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[VAR1:.*]] = "tosa.const" // CHECK-DAG: %[[VAR2:.*]] = tosa.reshape %arg0, %[[CONST0]] // CHECK-DAG: %[[VAR3:.*]] = tosa.reshape %arg1, %[[CONST1]] @@ -2873,12 +3150,12 @@ func.func @test_gather_nd(%arg0: tensor<13x21x3xf32>, %arg1: tensor<6x7x2xi32>) // ----- // CHECK-LABEL: test_gather_cast // CHECK-DAG: %[[VAR1:.*]] = tosa.cast %arg1 -// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {value = dense<[1, 13, 63]> : tensor<3xindex>} +// CHECK-DAG: %[[VAR10:.*]] = tosa.const_shape {values = dense<[1, 13, 63]> : tensor<3xindex>} // CHECK-DAG: %[[VAR2:.*]] = tosa.reshape %arg0, %[[VAR10]] -// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {value = dense<[1, 49]> : tensor<2xindex>} +// CHECK-DAG: %[[VAR11:.*]] = tosa.const_shape {values = dense<[1, 49]> : tensor<2xindex>} // CHECK-DAG: %[[VAR3:.*]] = tosa.reshape %[[VAR1]], %[[VAR11]] // CHECK-DAG: %[[VAR4:.*]] = tosa.gather %[[VAR2]], %[[VAR3]] -// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {value = dense<[7, 7, 21, 3]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR12:.*]] = tosa.const_shape {values = dense<[7, 7, 21, 3]> : tensor<4xindex>} // CHECK-DAG: %[[VAR5:.*]] = tosa.reshape %[[VAR4]], %[[VAR12]] // CHECK: return %[[VAR5]] func.func @test_gather_cast(%arg0: 
tensor<13x21x3xf32>, %arg1: tensor<7x7xi64>) -> tensor<*xf32> { @@ -2889,12 +3166,12 @@ func.func @test_gather_cast(%arg0: tensor<13x21x3xf32>, %arg1: tensor<7x7xi64>) // ----- // CHECK-LABEL: test_sparse_to_dense -// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<[1, -1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {value = dense<[1, -1]> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {value = dense<[1, 48]> : tensor<2xindex>} : () -> !tosa.shape<2> -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<{{\[\[}}48, 1]]> : tensor<1x2xi32>}> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<-1> : tensor<1x48x1xi64>}> +// CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {values = dense<[1, -1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {values = dense<[1, -1]> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {values = dense<[1, 48]> : tensor<2xindex>} : () -> !tosa.shape<2> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<{{\[\[}}48, 1]]> : tensor<1x2xi32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<-1> : tensor<1x48x1xi64>}> // CHECK-DAG: %[[VAR2:.*]] = tosa.cast %arg0 // CHECK-DAG: %[[VAR4:.*]] = tosa.mul %[[VAR2]], %[[VAR0]], %[[SHIFT]] // CHECK-DAG: %[[VAR5:.*]] = tosa.reduce_sum %[[VAR4]] {axis = 1 : i32} @@ -2912,6 +3189,71 @@ func.func @test_sparse_to_dense(%arg0 : tensor, %arg1 : tensor) // ----- +// CHECK-LABEL: test_scatter_nd +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x224x512xf32>}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1x2xi32>}> +// CHECK-DAG: %[[VAR3:.*]] = tosa.reduce_sum %[[VAR2:.*]] {axis = 1 : i32} : (tensor<1x2xi32>) +// CHECK-DAG: %[[VAR5:.*]] = tosa.scatter %[[VAR1:.*]], %[[VAR3:.*]], %arg0 : (tensor<1x224x512xf32>, tensor<1x1xi32>, tensor<1x1x512xf32>) +// CHECK: return %[[VAR5]] +func.func @test_scatter_nd(%arg0: tensor<1x1x512xf32>) -> tensor<1x224x512xf32> { + %shape = "tfl.pseudo_const"() <{value = dense<[1, 224, 512]> : tensor<3xi32>}> : () -> tensor<3xi32> + %indices = "tfl.pseudo_const"() <{value = dense<[[[0, 0]]]> : tensor<1x1x2xi32>}> : () -> tensor<1x1x2xi32> + %0 = "tfl.scatter_nd"(%indices, %arg0, %shape) : (tensor<1x1x2xi32>, tensor<1x1x512xf32>, tensor<3xi32>) -> tensor<1x224x512xf32> + func.return %0 : tensor<1x224x512xf32> +} + +// ----- + +// CHECK-LABEL: test_scatter_nd_reshape +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<{{\[\[}}8, 4, 1]]> : tensor<1x3xi32>}> : () +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x16x4xf32>}> : () +// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{values = dense<{{\[\[}}0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3], [1, 0, 0], [1, 0, 1], [1, 0, 2], [1, 0, 3]]> : tensor<8x3xi32>}> : () +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[NEW_SHAPE:.*]] = tosa.const_shape {values = dense<[1, 8, 4]> : tensor<3xindex>} +// CHECK-DAG: %[[NEW_SHAPE1:.*]] = tosa.const_shape {values = dense<[1, 8]> : tensor<2xindex>} +// CHECK-DAG: %[[NEW_SHAPE2:.*]] = tosa.const_shape {values = dense<[2, 2, 4, 4]> : tensor<4xindex>} +// CHECK-DAG: %[[VAR4:.*]] = 
tosa.reshape %arg0, %[[NEW_SHAPE]] : (tensor<2x2x2x4xf32>, !tosa.shape<3>) +// CHECK-DAG: %[[VAR5:.*]] = tosa.mul %[[VAR3]], %[[VAR1]], %[[SHIFT]] : (tensor<8x3xi32>, tensor<1x3xi32>, tensor<1xi8>) +// CHECK-DAG: %[[VAR6:.*]] = tosa.reduce_sum %[[VAR5]] {axis = 1 : i32} : (tensor<8x3xi32>) +// CHECK-DAG: %[[VAR7:.*]] = tosa.reshape %[[VAR6]], %[[NEW_SHAPE1]] : (tensor<8x1xi32>, !tosa.shape<2>) +// CHECK-DAG: %[[VAR8:.*]] = tosa.scatter %[[VAR2]], %[[VAR7]], %[[VAR4]] : (tensor<1x16x4xf32>, tensor<1x8xi32>, tensor<1x8x4xf32>) +// CHECK-DAG: %[[VAR9:.*]] = tosa.reshape %[[VAR8]], %[[NEW_SHAPE2]] : (tensor<1x16x4xf32>, !tosa.shape<4>) +// CHECK-DAG: return %[[VAR9]] +func.func @test_scatter_nd_reshape(%arg0: tensor<2x2x2x4xf32>) -> tensor<2x2x4x4xf32> { + %shape = "tfl.pseudo_const"() <{value = dense<[2, 2, 4, 4]> : tensor<4xi32>}> : () -> tensor<4xi32> + %indices = "tfl.pseudo_const"() <{value = dense<[[[[0, 0, 0], [0, 0, 1]], [[0, 0, 2], [0, 0, 3]]], [[[1, 0, 0], [1, 0, 1]], [[1, 0, 2], [1, 0, 3]]]]> : tensor<2x2x2x3xi32>}> : () -> tensor<2x2x2x3xi32> + %0 = "tfl.scatter_nd"(%indices, %arg0, %shape) : (tensor<2x2x2x3xi32>, tensor<2x2x2x4xf32>, tensor<4xi32>) -> tensor<2x2x4x4xf32> + func.return %0 : tensor<2x2x4x4xf32> +} + +// ----- + +// CHECK-LABEL: test_scatter_nd_qi8 +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<0> : tensor<1x224x512xi8>}> : () -> tensor<1x224x512x!quant.uniform> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1x2xi32>}> +// CHECK-DAG: %[[VAR3:.*]] = tosa.reduce_sum %[[VAR2:.*]] {axis = 1 : i32} : (tensor<1x2xi32>) +// CHECK-DAG: %[[VAR4:.*]] = tosa.scatter %[[VAR1:.*]], %[[VAR3:.*]], %arg0 : (tensor<1x224x512x!quant.uniform>, tensor<1x1xi32>, tensor<1x1x512x!quant.uniform>) +// CHECK: return %[[VAR4]] +func.func @test_scatter_nd_qi8(%arg0: tensor<1x1x512x!quant.uniform>) -> tensor<1x224x512x!quant.uniform> { + %shape = "tfl.pseudo_const"() <{value = dense<[1, 224, 512]> : tensor<3xi32>}> : () -> tensor<3xi32> + %indices = "tfl.pseudo_const"() <{value = dense<[[[0, 0]]]> : tensor<1x1x2xi32>}> : () -> tensor<1x1x2xi32> + %0 = "tfl.scatter_nd"(%indices, %arg0, %shape) : (tensor<1x1x2xi32>, tensor<1x1x512x!quant.uniform>, tensor<3xi32>) -> tensor<1x224x512x!quant.uniform> + func.return %0 : tensor<1x224x512x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: test_scatter_nd_duplicate_indices +// CHECK: tfl.scatter_nd +func.func @test_scatter_nd_duplicate_indices(%arg0: tensor<2x2x2x4xf32>) -> tensor<2x2x4x4xf32> { + %shape = "tfl.pseudo_const"() <{value = dense<[2, 2, 4, 4]> : tensor<4xi32>}> : () -> tensor<4xi32> + %indices = "tfl.pseudo_const"() <{value = dense<[[[[0, 0, 0], [0, 0, 1]], [[0, 0, 2], [0, 0, 3]]], [[[1, 0, 0], [1, 0, 0]], [[1, 0, 2], [1, 0, 3]]]]> : tensor<2x2x2x3xi32>}> : () -> tensor<2x2x2x3xi32> + %0 = "tfl.scatter_nd"(%indices, %arg0, %shape) : (tensor<2x2x2x3xi32>, tensor<2x2x2x4xf32>, tensor<4xi32>) -> tensor<2x2x4x4xf32> + func.return %0 : tensor<2x2x4x4xf32> +} + +// ----- + // CHECK-LABEL: @test_arg_max func.func @test_arg_max(%arg0: tensor<13x21x3xf32>) -> tensor<*xi32> { // CHECK: %[[ARGMAX:.+]] = tosa.argmax %arg0 {axis = 1 : i32} @@ -2934,8 +3276,9 @@ func.func @test_arg_max_negative_dim(%arg0: tensor<13x21x3xf32>) -> tensor<13x21 // CHECK-LABEL: @test_arg_min_f32 func.func @test_arg_min_f32(%arg0: tensor<13x21x3xf32>) -> tensor<*xi32> { - // CHECK: %[[NEG:.+]] = tosa.negate %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> - // CHECK: tosa.argmax %[[NEG]] {axis = 1 : i32} + // CHECK-DAG: 
%[[CONST_0:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> + // CHECK-DAG: %[[NEG:.+]] = tosa.negate %arg0, %[[CONST_0]], %[[CONST_0]] : (tensor<13x21x3xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<13x21x3xf32> + // CHECK-DAG: tosa.argmax %[[NEG]] {axis = 1 : i32} %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor %1 = "tfl.arg_min"(%arg0, %0) : (tensor<13x21x3xf32>, tensor) -> tensor<*xi32> func.return %1 : tensor<*xi32> @@ -2945,7 +3288,7 @@ func.func @test_arg_min_f32(%arg0: tensor<13x21x3xf32>) -> tensor<*xi32> { // CHECK-LABEL: @test_arg_min_i32 func.func @test_arg_min_i32(%arg0: tensor<13x21x3xi32>) -> tensor<*xi32> { - // CHECK: %[[ONE:.+]] = "tosa.const"() <{value = dense<-1> : tensor<1x1x1xi32>}> + // CHECK: %[[ONE:.+]] = "tosa.const"() <{values = dense<-1> : tensor<1x1x1xi32>}> // CHECK: %[[SUB:.+]] = tosa.sub %[[ONE]], %arg0 // CHECK: %[[ARGMAX:.+]] = tosa.argmax %[[SUB]] {axis = 1 : i32} %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor @@ -2955,17 +3298,22 @@ func.func @test_arg_min_i32(%arg0: tensor<13x21x3xi32>) -> tensor<*xi32> { // ----- -// CHECK-LABEL: @test_arg_min_ui8 +// CHECK-LABEL: test_arg_min_ui8 +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3xui8> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<-1> : tensor<1x1x1xi8>}> : () -> tensor<1x1x1xi8> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_6:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_4]], %[[VAL_5]], %[[VAL_3]], %[[VAL_2]] {input_unsigned = true, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<13x21x3xui8>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<13x21x3x!quant.uniform> +// CHECK: %[[VAL_7:.*]] = tosa.cast %[[VAL_6]] : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3xi8> +// CHECK: %[[VAL_8:.*]] = tosa.sub %[[VAL_1]], %[[VAL_7]] : (tensor<1x1x1xi8>, tensor<13x21x3xi8>) -> tensor<13x21x3xi8> +// CHECK: %[[VAL_9:.*]] = tosa.argmax %[[VAL_8]] {axis = 1 : i32} : (tensor<13x21x3xi8>) -> tensor<13x3xi8> +// CHECK: %[[VAL_10:.*]] = tosa.rescale %[[VAL_9]], %[[VAL_4]], %[[VAL_5]], %[[VAL_3]], %[[VAL_2]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<13x3xi8>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<13x3x!quant.uniform> +// CHECK: %[[VAL_11:.*]] = tosa.rescale %[[VAL_10]], %[[VAL_4]], %[[VAL_5]], %[[VAL_2]], %[[VAL_3]] {input_unsigned = false, output_unsigned = true, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<13x3x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<13x3xui8> +// CHECK: %[[VAL_12:.*]] = tensor.cast %[[VAL_11]] : tensor<13x3xui8> to tensor<*xui8> +// CHECK: return %[[VAL_12]] : tensor<*xui8> func.func @test_arg_min_ui8(%arg0: tensor<13x21x3xui8>) -> tensor<*xui8> { - // CHECK: %[[MAX:.+]] = "tosa.const"() <{value = dense<-1> : tensor<1x1x1xi8>} - // CHECK: %[[RESCALE:.+]] = tosa.rescale %arg0 {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = -128 : i32, 
per_channel = false, scale32 = true, shift = array} - // CHECK: %[[CAST:.+]] = tosa.cast %[[RESCALE]] : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3xi8> - // CHECK: %[[SUB:.+]] = tosa.sub %[[MAX]], %[[CAST]] - // CHECK: %[[ARGMAX:.+]] = tosa.argmax %[[SUB]] {axis = 1 : i32} : (tensor<13x21x3xi8>) -> tensor<13x3xi8> - // CHECK: %[[RESCALE2:.+]] = tosa.rescale %[[ARGMAX]] {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = -128 : i32, per_channel = false, scale32 = true, shift = array} - // CHECK: %[[RESCALE3:.+]] = tosa.rescale %[[RESCALE2]] {double_round = false, input_zp = -128 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} - // CHECK: %[[CAST2:.+]] = tensor.cast %[[RESCALE3]] : tensor<13x3xui8> to tensor<*xui8> - // CHECK: return %[[CAST2]] : tensor<*xui8> %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor %1 = "tfl.arg_min"(%arg0, %0) : (tensor<13x21x3xui8>, tensor) -> tensor<*xui8> func.return %1 : tensor<*xui8> @@ -2974,12 +3322,12 @@ func.func @test_arg_min_ui8(%arg0: tensor<13x21x3xui8>) -> tensor<*xui8> { // ----- // CHECK-LABEL: test_fakequant -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<-2.00003052> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<1.99996948> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{value = dense<6.10360876E-5> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{value = dense<16383.75> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() <{value = dense<5.000000e-01> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<-2.00003052> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<1.99996948> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{values = dense<6.10360876E-5> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR3:.*]] = "tosa.const"() <{values = dense<16383.75> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[VAR4:.*]] = "tosa.const"() <{values = dense<5.000000e-01> : tensor<1x1x1xf32>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[VAR6:.*]] = tosa.minimum %arg0, %[[VAR1]] // CHECK-DAG: %[[VAR8:.*]] = tosa.maximum %[[VAR6]], %[[VAR0]] // CHECK-DAG: %[[VAR10:.*]] = tosa.sub %[[VAR8]], %[[VAR0]] @@ -2995,27 +3343,6 @@ func.func @test_fakequant(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // ----- -// CHECK-LABEL: @test_fullyconnected_hybrid -func.func @test_fullyconnected_hybrid(%arg0: tensor<14x19xf32>, %arg1: tensor<28x19x!quant.uniform>, %arg2: tensor<28xf32>) -> tensor<*xf32> { - // This verifies that the constant is decomposed into a dequantization via a - // cast, subtract, and multiplication. 
- // CHECK-DAG: %[[CONST0:.*]] = tosa.const_shape {value = dense<[14, 1, 1, 19]> : tensor<4xindex>} - // CHECK-DAG: %[[CONST1:.*]] = tosa.const_shape {value = dense<[28, 1, 1, 19]> : tensor<4xindex>} - // CHECK-DAG: %[[CONST2:.*]] = tosa.const_shape {value = dense<[14, 28]> : tensor<2xindex>} - // CHECK-DAG: %[[VAL0:.*]] = "tosa.const"() <{value = dense<1.700000e+01> : tensor<1x1xf32>}> - // CHECK-DAG: %[[CONST3:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> - // CHECK: %[[VAL1:.*]] = tosa.cast %arg1 - // CHECK: %[[VAL2:.*]] = tosa.sub %[[VAL1]], %[[VAL0]] - // CHECK: %[[VAL3:.*]] = tosa.reshape %arg0, %[[CONST0]] - // CHECK: %[[VAL4:.*]] = tosa.reshape %[[VAL2]], %[[CONST1]] - // CHECK: %[[VAL5:.*]] = tosa.conv2d %[[VAL3]], %[[VAL4]], %arg2, %[[CONST3]], %[[CONST3]] {acc_type = f32, dilation = array, pad = array, stride = array} - // CHECK: %[[VAL6:.*]] = tosa.reshape %[[VAL5]], %[[CONST2]] - %2 = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<14x19xf32>, tensor<28x19x!quant.uniform>, tensor<28xf32>) -> tensor<*xf32> - func.return %2 : tensor<*xf32> -} - -// ----- - // CHECK-LABEL: @test_conv2d_infer // CHECK: -> tensor<1x32x32x16xf32> func.func @test_conv2d_infer(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>) -> tensor<*xf32> { @@ -3029,6 +3356,15 @@ func.func @test_conv2d_infer(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x // ----- +// CHECK-LABEL: @test_conv2d_no_bias +func.func @test_conv2d_no_bias(%input: tensor<1x32x32x8x!quant.uniform>, %filter: tensor<3x3x8x16x!quant.uniform>) -> tensor<1x32x32x3x!quant.uniform> { + %bias = "tfl.no_value"() {value} : () -> none + %0 = "tfl.conv_2d"(%input, %filter, %bias) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8x!quant.uniform>, tensor<3x3x8x16x!quant.uniform>, none) -> tensor<1x32x32x3x!quant.uniform> + return %0 : tensor<1x32x32x3x!quant.uniform> +} + +// ----- + // CHECK-LABEL: @test_squeeze func.func @test_squeeze(%arg0: tensor<2x1x3x1xf32>) -> tensor<2x3x1xf32> { // CHECK: tosa.reshape @@ -3051,12 +3387,12 @@ func.func @test_squeeze_neg(%arg0: tensor<2x1x3x1xf32>) -> tensor<2x1x3xf32> { // CHECK-LABEL: test_gelu // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x4x8x19xf32> -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xf32>}> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<4.471500e-02> : tensor<1x1x1x1xf32>}> -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<0.797884583> : tensor<1x1x1x1xf32>}> -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x1x1xf32>}> -// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{value = dense<5.000000e-01> : tensor<1x1x1x1xf32>}> -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<3.000000e+00> : tensor<1x1x1x1xf32>}> +// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<4.471500e-02> : tensor<1x1x1x1xf32>}> +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<0.797884583> : tensor<1x1x1x1xf32>}> +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<1.000000e+00> : tensor<1x1x1x1xf32>}> +// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<5.000000e-01> : tensor<1x1x1x1xf32>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values 
= dense<0> : tensor<1xi8>}> // CHECK: %[[VAL_6:.*]] = tosa.pow %[[VAL_0]], %[[VAL_1]] // CHECK: %[[VAL_7:.*]] = tosa.mul %[[VAL_6]], %[[VAL_2]], %[[SHIFT]] // CHECK: %[[VAL_8:.*]] = tosa.add %[[VAL_0]], %[[VAL_7]] @@ -3074,7 +3410,7 @@ func.func @test_gelu(%arg0: tensor<1x4x8x19xf32>) -> tensor<1x4x8x19xf32> { // CHECK-LABEL: test_gelu_qi8 // CHECK-SAME: %[[VAR0:.*]]: tensor<1x4x4x4x!quant.uniform> -// CHECK: %[[VAR1:.*]] = "tosa.const"() <{value = dense<{{.*}}> : tensor<256xi8>}> +// CHECK: %[[VAR1:.*]] = "tosa.const"() <{values = dense<{{.*}}> : tensor<256xi8>}> // CHECK: %[[VAR2:.*]] = tosa.table %[[VAR0]], %[[VAR1]] : (tensor<1x4x4x4x!quant.uniform>, tensor<256x!quant.uniform> func.func @test_gelu_qi8(%arg0: tensor<1x4x4x4x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> { %0 = "tfl.gelu"(%arg0) {approximate = true} : (tensor<1x4x4x4x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> @@ -3084,14 +3420,14 @@ func.func @test_gelu_qi8(%arg0: tensor<1x4x4x4x!quant.uniform : tensor<2xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[7, 1]> : tensor<2xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[0, 1]> : tensor<2xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[7, 2]> : tensor<2xindex>} -// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {value = dense<[2, 0]> : tensor<2xindex>} -// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {value = dense<[1, 9]> : tensor<2xindex>} -// CHECK-DAG: %[[VAL_7:.*]] = tosa.const_shape {value = dense<[2, 9]> : tensor<2xindex>} -// CHECK-DAG: %[[VAL_8:.*]] = tosa.const_shape {value = dense<[1, 0]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[0, 7]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[7, 1]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[0, 1]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[7, 2]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_5:.*]] = tosa.const_shape {values = dense<[2, 0]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_6:.*]] = tosa.const_shape {values = dense<[1, 9]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_7:.*]] = tosa.const_shape {values = dense<[2, 9]> : tensor<2xindex>} +// CHECK-DAG: %[[VAL_8:.*]] = tosa.const_shape {values = dense<[1, 0]> : tensor<2xindex>} // CHECK: %[[VAL_9:.*]] = tosa.slice %arg0, %[[VAL_8]], %[[VAL_7]] : (tensor<4x9x!quant.uniform>, !tosa.shape<2>, !tosa.shape<2>) -> tensor<2x9x!quant.uniform> // CHECK: %[[VAL_10:.*]] = tosa.reverse %[[VAL_9]] {axis = 0 : i32} : (tensor<2x9x!quant.uniform>) -> tensor<2x9x!quant.uniform> // CHECK: %[[VAL_11:.*]] = tosa.slice %arg0, %[[VAL_5]], %[[VAL_6]] : (tensor<4x9x!quant.uniform>, !tosa.shape<2>, !tosa.shape<2>) -> tensor<1x9x!quant.uniform> @@ -3109,10 +3445,10 @@ func.func @mirrorpad_reflect(%arg0: tensor<4x9x!quant.uniform : tensor<3xindex>} -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[16, 1, 2]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[1, 23, 2]> : tensor<3xindex>} -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<0> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[16, 24, 1]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[16, 1, 2]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[1, 23, 2]> : tensor<3xindex>} +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<0> : 
tensor<3xindex>} // CHECK: %[[VAL_5:.*]] = tosa.slice %arg0, %[[VAL_4]], %[[VAL_3]] : (tensor<15x23x2xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<1x23x2xf32> // CHECK: %[[VAL_6:.*]] = tosa.concat %[[VAL_5]], %arg0 {axis = 0 : i32} : (tensor<1x23x2xf32>, tensor<15x23x2xf32>) -> tensor<16x23x2xf32> // CHECK: %[[VAL_7:.*]] = tosa.slice %[[VAL_6]], %[[VAL_4]], %[[VAL_2]] : (tensor<16x23x2xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<16x1x2xf32> @@ -3161,9 +3497,9 @@ func.func @test_tfl_custom(%arg0: tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32x // CHECK-LABEL: test_tfl_while_loop // CHECK: %[[VAL_0:.*]]: tensor<1x4x4x4xf32> {tf_saved_model.index_path = ["placeholder_0"]}) -> (tensor<1x4x4x4xf32> {tf_saved_model.index_path = ["output_0"]}) { -// CHECK-DAG: %[[VAL_20:.*]] = tosa.const_shape {value = dense<1> : tensor<1xindex>} -// CHECK-DAG: %[[VAL_21:.*]] = tosa.const_shape {value = dense<> : tensor<0xindex>} -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<2.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK-DAG: %[[VAL_20:.*]] = tosa.const_shape {values = dense<1> : tensor<1xindex>} +// CHECK-DAG: %[[VAL_21:.*]] = tosa.const_shape {values = dense<> : tensor<0xindex>} +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<2.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> // CHECK: %[[VAL_2:.*]] = tosa.while_loop (%[[VAL_3:.*]] = %[[VAL_0]]) : (tensor<1x4x4x4xf32>) -> tensor<1x4x4x4xf32> { // CHECK: %[[VAL_4:.*]] = tosa.reduce_sum %[[VAL_3]] {axis = 1 : i32} : (tensor<1x4x4x4xf32>) -> tensor<1x1x4x4xf32> // CHECK: %[[VAL_5:.*]] = tosa.reduce_sum %[[VAL_4]] {axis = 2 : i32} : (tensor<1x1x4x4xf32>) -> tensor<1x1x1x4xf32> @@ -3211,7 +3547,7 @@ func.func private @result_body(%arg0: tensor<1x4x4x4xf32>) -> tensor<1x4x4x4xf32 // CHECK-LABEL: test_rfft2d // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x16xf32> -// CHECK: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[1, 8, 9, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_10:.*]] = tosa.const_shape {values = dense<[1, 8, 9, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK: %[[VAL_1:.*]], %[[VAL_2:.*]] = tosa.rfft2d %[[VAL_0]] : (tensor<1x8x16xf32>) -> (tensor<1x8x9xf32>, tensor<1x8x9xf32>) // CHECK: %[[VAL_3:.*]] = tosa.reshape %[[VAL_1]], %[[VAL_10]] : (tensor<1x8x9xf32>, !tosa.shape<4>) -> tensor<1x8x9x1xf32> // CHECK: %[[VAL_4:.*]] = tosa.reshape %[[VAL_2]], %[[VAL_10]] : (tensor<1x8x9xf32>, !tosa.shape<4>) -> tensor<1x8x9x1xf32> @@ -3226,9 +3562,9 @@ func.func @test_rfft2d(%arg0: tensor<1x8x16xf32>) -> tensor<1x8x9xcomplex> // ----- // CHECK-LABEL: test_rfft2d_crop_input -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[13, 2, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[13, 2, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<0> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[13, 2, 2, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[13, 2, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<0> : tensor<3xindex>} : () -> !tosa.shape<3> // CHECK: %[[VAL_4:.*]] = tosa.slice %arg0, %[[VAL_3]], %[[VAL_2]] : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<13x2x2xf32> // CHECK: %[[VAL_5:.*]], %[[VAL_6:.*]] = tosa.rfft2d %[[VAL_4]] : (tensor<13x2x2xf32>) -> 
(tensor<13x2x2xf32>, tensor<13x2x2xf32>) // CHECK: %[[VAL_7:.*]] = tosa.reshape %[[VAL_5]], %[[VAL_1]] : (tensor<13x2x2xf32>, !tosa.shape<4>) -> tensor<13x2x2x1xf32> @@ -3244,9 +3580,9 @@ func.func @test_rfft2d_crop_input(%arg0: tensor<13x21x3xf32>) -> tensor<13x2x2xc // CHECK-LABEL: test_rfft2d_pad_input // CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3xf32> -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[0, 0, 0, 11, 0, 5]> : tensor<6xindex>} : () -> !tosa.shape<6> -// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[13, 32, 5, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[0, 0, 0, 11, 0, 5]> : tensor<6xindex>} : () -> !tosa.shape<6> +// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {values = dense<[13, 32, 5, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK: %[[VAL_3:.*]] = tosa.pad %[[VAL_0]], %[[VAL_2]], %[[VAL_1]] : (tensor<13x21x3xf32>, !tosa.shape<6>, tensor<1xf32>) -> tensor<13x32x8xf32> // CHECK: %[[VAL_4:.*]], %[[VAL_5:.*]] = tosa.rfft2d %[[VAL_3]] : (tensor<13x32x8xf32>) -> (tensor<13x32x5xf32>, tensor<13x32x5xf32>) // CHECK: %[[VAL_6:.*]] = tosa.reshape %[[VAL_4]], %[[VAL_10]] : (tensor<13x32x5xf32>, !tosa.shape<4>) -> tensor<13x32x5x1xf32> @@ -3264,11 +3600,11 @@ func.func @test_rfft2d_pad_input(%arg0: tensor<13x21x3xf32>) -> (tensor<13x32x5x // ----- // CHECK-LABEL: test_rfft2d_crop_height_pad_width -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[13, 2, 9, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<0> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[13, 2, 16]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {value = dense<[0, 0, 0, 0, 0, 13]> : tensor<6xindex>} : () -> !tosa.shape<6> -// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[13, 2, 9, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<0> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[13, 2, 16]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_4:.*]] = tosa.const_shape {values = dense<[0, 0, 0, 0, 0, 13]> : tensor<6xindex>} : () -> !tosa.shape<6> +// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32> // CHECK: %[[VAL_6:.*]] = tosa.pad %arg0, %[[VAL_4]], %[[VAL_5]] : (tensor<13x21x3xf32>, !tosa.shape<6>, tensor<1xf32>) -> tensor<13x21x16xf32> // CHECK: %[[VAL_7:.*]] = tosa.slice %[[VAL_6]], %[[VAL_2]], %[[VAL_3]] : (tensor<13x21x16xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<13x2x16xf32> // CHECK: %[[VAL_8:.*]], %[[VAL_9:.*]] = tosa.rfft2d %[[VAL_7]] : (tensor<13x2x16xf32>) -> (tensor<13x2x9xf32>, tensor<13x2x9xf32>) @@ -3286,9 +3622,9 @@ func.func @test_rfft2d_crop_height_pad_width(%arg0: tensor<13x21x3xf32>) -> (ten // ----- // CHECK-LABEL: test_real -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[1, 8, 9]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_2:.*]] 
= tosa.const_shape {value = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[1, 8, 9, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[1, 8, 9]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<0> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[1, 8, 9, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK: %[[VAL_4:.*]] = tosa.slice %arg0, %[[VAL_2]], %[[VAL_3]] : (tensor<1x8x9x2xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<1x8x9x1xf32> // CHECK: %[[VAL_5:.*]] = tosa.reshape %[[VAL_4]], %[[VAL_1]] : (tensor<1x8x9x1xf32>, !tosa.shape<3>) -> tensor<1x8x9xf32> func.func @test_real(%arg0: tensor<1x8x9xcomplex>) -> (tensor<1x8x9xf32>) { @@ -3310,9 +3646,9 @@ func.func @test_real_non_complex(%arg0: tensor<1x8x9xf32>) -> (tensor<1x8x9xf32> // ----- // CHECK-LABEL: test_imag -// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {value = dense<[1, 8, 9]> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {value = dense<[0, 0, 0, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {value = dense<[1, 8, 9, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_1:.*]] = tosa.const_shape {values = dense<[1, 8, 9]> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK-DAG: %[[VAL_2:.*]] = tosa.const_shape {values = dense<[0, 0, 0, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_3:.*]] = tosa.const_shape {values = dense<[1, 8, 9, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK: %[[VAL_4:.*]] = tosa.slice %arg0, %[[VAL_2]], %[[VAL_3]] : (tensor<1x8x9x2xf32>, !tosa.shape<4>, !tosa.shape<4>) -> tensor<1x8x9x1xf32> // CHECK: %[[VAL_5:.*]] = tosa.reshape %[[VAL_4]], %[[VAL_1]] : (tensor<1x8x9x1xf32>, !tosa.shape<3>) -> tensor<1x8x9xf32> func.func @test_imag(%arg0: tensor<1x8x9xcomplex>) -> (tensor<1x8x9xf32>) { @@ -3324,7 +3660,7 @@ func.func @test_imag(%arg0: tensor<1x8x9xcomplex>) -> (tensor<1x8x9xf32>) { // CHECK-LABEL: test_imag_non_complex // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x9xf32> -// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x8x9xf32>}> : () -> tensor<1x8x9xf32> +// CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x8x9xf32>}> : () -> tensor<1x8x9xf32> // CHECK: return %[[VAL_1]] : tensor<1x8x9xf32> func.func @test_imag_non_complex(%arg0: tensor<1x8x9xf32>) -> (tensor<1x8x9xf32>) { %0 = "tfl.imag"(%arg0) {} : (tensor<1x8x9xf32>) -> tensor<1x8x9xf32> @@ -3334,9 +3670,11 @@ func.func @test_imag_non_complex(%arg0: tensor<1x8x9xf32>) -> (tensor<1x8x9xf32> // ----- // CHECK-LABEL: test_squared_difference_qi8 -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> -// CHECK-DAG: %[[VAR2:.*]] = tosa.rescale %arg0 -// CHECK-DAG: %[[VAR3:.*]] = tosa.rescale %arg1 +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAR0:.*]] = tosa.rescale %arg0 +// CHECK-DAG: %[[VAR1:.*]] = tosa.rescale %arg1 +// CHECK-DAG: %[[VAR2:.*]] = tosa.rescale %[[VAR0]] +// CHECK-DAG: %[[VAR3:.*]] = tosa.rescale %[[VAR1]] // CHECK-DAG: %[[VAR4:.*]] = tosa.sub %[[VAR2]], %[[VAR3]] // CHECK-DAG: %[[VAR5:.*]] = tosa.mul %[[VAR4]], %[[VAR4]], %[[SHIFT]] // CHECK-DAG: %[[VAR6:.*]] = tosa.rescale %[[VAR5]] @@ -3349,7 +3687,7 @@ func.func 
@test_squared_difference_qi8(%arg0: tensor<1x197x768x!quant.uniform : tensor<1xi8>}> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK-DAG: %[[VAR0:.*]] = tosa.sub %arg0, %arg1 // CHECK-DAG: %[[VAR1:.*]] = tosa.mul %[[VAR0]], %[[VAR0]], %[[SHIFT]] // CHECK: return %[[VAR1]] @@ -3361,14 +3699,29 @@ func.func @test_squared_difference_f32(%arg0: tensor<1x197x768xf32>, %arg1: tens // ----- // CHECK-LABEL: test_squared_difference_with_unequal_ranks_qi8 -// CHECK: %[[C:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> -// CHECK: %[[CS:.*]] = tosa.const_shape {value = dense<[1, 1, 1, 44]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK: %[[RH:.*]] = tosa.reshape %arg1, %[[CS]] : (tensor<44x!quant.uniform>, !tosa.shape<4>) -> tensor<1x1x1x44x!quant.uniform> -// CHECK: %[[RS1:.*]] = tosa.rescale %arg0 -// CHECK: %[[RS2:.*]] = tosa.rescale %[[RH]] -// CHECK: %[[SUB:.*]] = tosa.sub %[[RS1]], %[[RS2]] -// CHECK: %[[MUL:.*]] = tosa.mul %[[SUB]], %[[SUB]], %[[C]] -// CHECK: %[[RS3:.*]] = tosa.rescale %[[MUL]] +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x304x1x44x!quant.uniform> +// CHECK-SAME: %[[VAL_1:.*]]: tensor<44x!quant.uniform> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<49> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<2132442608> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_6:.*]] = tosa.const_shape {values = dense<[1, 1, 1, 44]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_7:.*]] = "tosa.const"() <{values = dense<38> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_8:.*]] = "tosa.const"() <{values = dense<1091903658> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_9:.*]] = "tosa.const"() <{values = dense<31> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_10:.*]] = "tosa.const"() <{values = dense<-16> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_11:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_12:.*]] = "tosa.const"() <{values = dense<23> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_13:.*]] = "tosa.const"() <{values = dense<-2> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_14:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_15:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_11]], %[[VAL_12]], %[[VAL_13]], %[[VAL_14]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x304x1x44x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<1x304x1x44xi32> +// CHECK: %[[VAL_16:.*]] = tosa.rescale %[[VAL_1]], %[[VAL_11]], %[[VAL_12]], %[[VAL_10]], %[[VAL_14]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<44x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<44xi32> +// CHECK: %[[VAL_17:.*]] = tosa.rescale %[[VAL_15]], %[[VAL_11]], %[[VAL_9]], %[[VAL_14]], %[[VAL_14]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x304x1x44xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, 
tensor<1xi32>) -> tensor<1x304x1x44xi32> +// CHECK: %[[VAL_18:.*]] = tosa.rescale %[[VAL_16]], %[[VAL_8]], %[[VAL_7]], %[[VAL_14]], %[[VAL_14]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<44xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi32>) -> tensor<44xi32> +// CHECK: %[[VAL_19:.*]] = tosa.reshape %[[VAL_18]], %[[VAL_6]] : (tensor<44xi32>, !tosa.shape<4>) -> tensor<1x1x1x44xi32> +// CHECK: %[[VAL_20:.*]] = tosa.sub %[[VAL_17]], %[[VAL_19]] : (tensor<1x304x1x44xi32>, tensor<1x1x1x44xi32>) -> tensor<1x304x1x44xi32> +// CHECK: %[[VAL_21:.*]] = tosa.mul %[[VAL_20]], %[[VAL_20]], %[[VAL_5]] : (tensor<1x304x1x44xi32>, tensor<1x304x1x44xi32>, tensor<1xi8>) -> tensor<1x304x1x44xi32> +// CHECK: %[[VAL_22:.*]] = tosa.rescale %[[VAL_21]], %[[VAL_4]], %[[VAL_3]], %[[VAL_14]], %[[VAL_2]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x304x1x44xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) -> tensor<1x304x1x44x!quant.uniform> func.func @test_squared_difference_with_unequal_ranks_qi8(%arg0: tensor<1x304x1x44x!quant.uniform>, %arg1: tensor<44x!quant.uniform>) -> tensor<1x304x1x44x!quant.uniform> { %0 = "tfl.squared_difference"(%arg0, %arg1) : (tensor<1x304x1x44x!quant.uniform>, tensor<44x!quant.uniform>) -> tensor<1x304x1x44x!quant.uniform> func.return %0 : tensor<1x304x1x44x!quant.uniform> @@ -3377,8 +3730,8 @@ func.func @test_squared_difference_with_unequal_ranks_qi8(%arg0: tensor<1x304x1x // ----- // CHECK-LABEL: test_squared_difference_with_unequal_ranks_f32 -// CHECK: %[[C:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> -// CHECK: %[[CS:.*]] = tosa.const_shape {value = dense<[1, 1, 1, 44]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[C:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> +// CHECK: %[[CS:.*]] = tosa.const_shape {values = dense<[1, 1, 1, 44]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK: %[[RH:.*]] = tosa.reshape %arg1, %[[CS]] : (tensor<44xf32>, !tosa.shape<4>) -> tensor<1x1x1x44xf32> // CHECK: %[[SUB:.*]] = tosa.sub %arg0, %[[RH]] // CHECK: %[[MUL:.*]] = tosa.mul %[[SUB]], %[[SUB]], %[[C]] @@ -3391,8 +3744,8 @@ func.func @test_squared_difference_with_unequal_ranks_f32(%arg0: tensor<1x304x1x // ----- // CHECK-LABEL: test_broadcast_to_f32 -// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[1, 1, 13, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{value = dense<-0.000000e+00> : tensor<3x3x13x7xf32>} +// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {values = dense<[1, 1, 13, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<-0.000000e+00> : tensor<3x3x13x7xf32>} // CHECK: %[[VAL_1:.*]] = tosa.reshape %arg0, %[[VAL_10]] : (tensor<13x1xf32>, !tosa.shape<4>) // CHECK: %[[VAL_2:.*]] = tosa.add %[[VAL_1]], %[[VAL_0]] : (tensor<1x1x13x1xf32>, tensor<3x3x13x7xf32>) -> tensor<3x3x13x7xf32> // CHECK: return %[[VAL_2]] : tensor<3x3x13x7xf32> @@ -3405,8 +3758,8 @@ func.func @test_broadcast_to_f32(%arg0: tensor<13x1xf32>) -> (tensor<3x3x13x7xf3 // ----- // CHECK-LABEL: test_broadcast_to_f16 -// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[1, 1, 13, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{value = dense<-0.000000e+00> : tensor<3x3x13x7xf16>}> +// CHECK-DAG: %[[VAL_10:.*]] = 
tosa.const_shape {values = dense<[1, 1, 13, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<-0.000000e+00> : tensor<3x3x13x7xf16>}> // CHECK: %[[VAL_1:.*]] = tosa.reshape %arg0, %[[VAL_10]] : (tensor<13x1xf16>, !tosa.shape<4>) // CHECK: %[[VAL_2:.*]] = tosa.add %[[VAL_1]], %[[VAL_0]] : (tensor<1x1x13x1xf16>, tensor<3x3x13x7xf16>) -> tensor<3x3x13x7xf16> // CHECK: return %[[VAL_2]] : tensor<3x3x13x7xf16> @@ -3419,8 +3772,8 @@ func.func @test_broadcast_to_f16(%arg0: tensor<13x1xf16>) -> (tensor<3x3x13x7xf1 // ----- // CHECK-LABEL: test_broadcast_to_i32 -// CHECK-DAG: %[[VAL_10]] = tosa.const_shape {value = dense<[1, 1, 13, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{value = dense<0> : tensor<7x7x13x3xi32>} +// CHECK-DAG: %[[VAL_10]] = tosa.const_shape {values = dense<[1, 1, 13, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<0> : tensor<7x7x13x3xi32>} // CHECK: %[[VAL_1:.*]] = tosa.reshape %arg0, %[[VAL_10]] : (tensor<13x1xi32>, !tosa.shape<4>) // CHECK: %[[VAL_2:.*]] = tosa.add %[[VAL_1]], %[[VAL_0]] : (tensor<1x1x13x1xi32>, tensor<7x7x13x3xi32>) -> tensor<7x7x13x3xi32> // CHECK: return %[[VAL_2]] : tensor<7x7x13x3xi32> @@ -3433,8 +3786,8 @@ func.func @test_broadcast_to_i32(%arg0: tensor<13x1xi32>) -> (tensor<3x3x13x3xi3 // ----- // CHECK-LABEL: test_broadcast_to_i1 -// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[1, 1, 13, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{value = dense : tensor<7x7x13x7xi1>} +// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {values = dense<[1, 1, 13, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{values = dense : tensor<7x7x13x7xi1>} // CHECK: %[[VAL_1:.*]] = tosa.reshape %arg0, %[[VAL_10]] : (tensor<13x1xi1>, !tosa.shape<4>) // CHECK: %[[VAL_2:.*]] = tosa.logical_or %[[VAL_1]], %[[VAL_0]] : (tensor<1x1x13x1xi1>, tensor<7x7x13x7xi1>) -> tensor<7x7x13x7xi1> // CHECK: return %[[VAL_2]] : tensor<7x7x13x7xi1> @@ -3447,8 +3800,8 @@ func.func @test_broadcast_to_i1(%arg0: tensor<13x1xi1>) -> (tensor<7x7x13x7xi1>) // ----- // CHECK-LABEL: test_broadcast_to_qi8 -// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[1, 1, 13, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{value = dense<0> : tensor<7x7x13x3xi32>} +// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {values = dense<[1, 1, 13, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<0> : tensor<7x7x13x3xi32>} // CHECK: %[[VAL_1:.*]] = tosa.reshape %arg0, %[[VAL_10]] // CHECK: %[[VAL_2:.*]] = tosa.cast %2 : (tensor<1x1x13x1x!quant.uniform>) -> tensor<1x1x13x1xi32> // CHECK: %[[VAL_3:.*]] = tosa.add %[[VAL_2]], %[[VAL_0]] : (tensor<1x1x13x1xi32>, tensor<7x7x13x3xi32>) -> tensor<7x7x13x3xi32> @@ -3463,7 +3816,7 @@ func.func @test_broadcast_to_qi8(%arg0: tensor<13x1x!quant.uniform : tensor<2xi48>} +// CHECK: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<[13, 7]> : tensor<2xi48>} // CHECK: %[[VAL_1:.*]] = "tfl.broadcast_to"(%arg0, %[[VAL_0]]) : (tensor<2x3x13x1xi32>, tensor<2xi48>) -> tensor<13x7xi32> // CHECK: return %[[VAL_1]] : tensor<13x7xi32> func.func @test_broadcast_to_smaller_rank(%arg0: tensor<2x3x13x1xi32>) -> (tensor<13x7xi32>) { @@ -3475,7 +3828,7 @@ func.func @test_broadcast_to_smaller_rank(%arg0: 
tensor<2x3x13x1xi32>) -> (tenso // ----- // CHECK-LABEL: test_broadcast_to_i48 -// CHECK: %[[VAL_0:.*]] = "tosa.const"() <{value = dense<[7, 7, 1, 7]> : tensor<4xi48>} +// CHECK: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<[7, 7, 1, 7]> : tensor<4xi48>} // CHECK: %[[VAL_1:.*]] = "tfl.broadcast_to"(%arg0, %[[VAL_0]]) : (tensor<1x1x13x1xi48>, tensor<4xi48>) -> tensor<7x7x13x7xi48> // CHECK: return %[[VAL_1]] : tensor<7x7x13x7xi48> func.func @test_broadcast_to_i48(%arg0: tensor<1x1x13x1xi48>) -> (tensor<7x7x13x7xi48>) { @@ -3487,9 +3840,9 @@ func.func @test_broadcast_to_i48(%arg0: tensor<1x1x13x1xi48>) -> (tensor<7x7x13x // ----- // CHECK-LABEL: test_transpose_conv2d_bias_f32 -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<1.000000e+00> : tensor<128xf32>}> : () -> tensor<128xf32> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<-1.000000e+00> : tensor<128x2x2x256xf32>}> : () -> tensor<128x2x2x256xf32> -// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<1.000000e+00> : tensor<128xf32>}> : () -> tensor<128xf32> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<-1.000000e+00> : tensor<128x2x2x256xf32>}> : () -> tensor<128x2x2x256xf32> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> // CHECK-DAG: %[[VAR3:.*]] = tosa.transpose_conv2d %arg0, %[[VAR1]], %[[VAR0]], %[[VAR2]], %[[VAR2]] {acc_type = f32, out_pad = array, stride = array} func.func @test_transpose_conv2d_bias_f32(%arg0: tensor<1x64x64x256xf32>) -> tensor<1x128x128x128xf32> { %cst = arith.constant dense<[1, 128, 128, 128]> : tensor<4xi32> @@ -3501,30 +3854,9 @@ func.func @test_transpose_conv2d_bias_f32(%arg0: tensor<1x64x64x256xf32>) -> ten // ----- -// CHECK-LABEL: test_cast_ui8 -// CHECK: %[[VAL_0:.*]] = tosa.rescale %arg0 {double_round = false, input_zp = 0 : i32, multiplier = array, output_zp = -128 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_1:.*]] = tosa.rescale %[[VAL_0]] {double_round = true, input_zp = -128 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_2:.*]] = tosa.cast %[[VAL_1]] : (tensor<13x21x3xi32>) -> tensor<13x21x3xf32> -func.func @test_cast_ui8(%arg0: tensor<13x21x3xui8>) -> (tensor<13x21x3xf32>) { - %0 = "tfl.cast"(%arg0) : (tensor<13x21x3xui8>) -> tensor<13x21x3xf32> - return %0 : tensor<13x21x3xf32> -} - -// ----- - -// CHECK-LABEL: test_cast_qi8 -// CHECK: %[[VAL_0:.*]] = tosa.rescale %arg0 {double_round = true, input_zp = -1 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} -// CHECK: %[[VAL_1:.*]] = tosa.cast %[[VAL_0]] : (tensor<13x21x3xi32>) -> tensor<13x21x3xf32> -func.func @test_cast_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> (tensor<13x21x3xf32>) { - %0 = "tfl.cast"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3xf32> - return %0 : tensor<13x21x3xf32> -} - -// ----- - // CHECK-LABEL: test_mul_with_unequal_ranks -// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {value = dense<[1, 1, 1, 384]> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> +// CHECK-DAG: %[[VAL_10:.*]] = tosa.const_shape {values = dense<[1, 1, 1, 384]> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK-DAG: %[[SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> // CHECK: %[[VAR0:.*]] = tosa.reshape %arg1, 
%[[VAL_10]] : (tensor<384xf32>, !tosa.shape<4>) -> tensor<1x1x1x384xf32> // CHECK: %[[VAR1:.*]] = tosa.mul %arg0, %[[VAR0]], %[[SHIFT]] : (tensor, tensor<1x1x1x384xf32>, tensor<1xi8>) func.func @test_mul_with_unequal_ranks(%arg0: tensor, %arg1: tensor<384xf32>) -> tensor { @@ -3535,26 +3867,33 @@ func.func @test_mul_with_unequal_ranks(%arg0: tensor, %arg1: // ----- // CHECK-LABEL: test_mul_with_unequal_ranks_qi8 -// CHECK: %[[C1:.*]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> -// CHECK: %[[CS:.*]] = tosa.const_shape {value = dense<1> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK: %[[C2:.*]] = "tosa.const"() <{value = dense<127> : tensor}> : () -> tensor> -// CHECK: %[[RS1:.*]] = tosa.rescale %arg0 -// CHECK: %[[RS2:.*]] = tosa.rescale %[[C2]] -// CHECK: %[[RH:.*]] = tosa.reshape %[[RS2]], %[[CS]] : (tensor, !tosa.shape<4>) -> tensor<1x1x1x1xi32> -// CHECK: %[[MUL:.*]] = tosa.mul %[[RS1]], %[[RH]], %[[C1]] : (tensor<1x192x192x3xi32>, tensor<1x1x1x1xi32>, tensor<1xi8>) -> tensor<1x192x192x3xi32> -// CHECK: %[[RS3:.*]] = tosa.rescale %[[MUL]] -// CHECK: return %[[RS3]] : tensor<1x192x192x3x!quant.uniform> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x192x192x3x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<38> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<1077952640> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = tosa.const_shape {values = dense<1> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<127> : tensor}> : () -> tensor> +// CHECK: %[[VAL_6:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_7:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_8:.*]] = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_9:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_10:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_6]], %[[VAL_7]], %[[VAL_8]], %[[VAL_9]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x192x192x3x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<1x192x192x3xi32> +// CHECK: %[[VAL_11:.*]] = tosa.rescale %[[VAL_5]], %[[VAL_6]], %[[VAL_7]], %[[VAL_8]], %[[VAL_9]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor +// CHECK: %[[VAL_12:.*]] = tosa.reshape %[[VAL_11]], %[[VAL_4]] : (tensor, !tosa.shape<4>) -> tensor<1x1x1x1xi32> +// CHECK: %[[VAL_13:.*]] = tosa.mul %[[VAL_10]], %[[VAL_12]], %[[VAL_3]] : (tensor<1x192x192x3xi32>, tensor<1x1x1x1xi32>, tensor<1xi8>) -> tensor<1x192x192x3xi32> +// CHECK: %[[VAL_14:.*]] = tosa.rescale %[[VAL_13]], %[[VAL_2]], %[[VAL_1]], %[[VAL_9]], %[[VAL_8]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<1x192x192x3xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) -> tensor<1x192x192x3x!quant.uniform> func.func @test_mul_with_unequal_ranks_qi8(%arg0: tensor<1x192x192x3x!quant.uniform>) -> tensor<1x192x192x3x!quant.uniform> { %0 = 
"tfl.pseudo_qconst"() {qtype = tensor>, value = dense<127> : tensor} : () -> tensor> %1 = tfl.mul(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<1x192x192x3x!quant.uniform>, tensor>) -> tensor<1x192x192x3x!quant.uniform> - return %1 : tensor<1x192x192x3x!quant.uniform> + func.return %1 : tensor<1x192x192x3x!quant.uniform> } + // ----- // CHECK-LABEL: test_sub_with_unequal_ranks_qi8 -// CHECK: %[[CS:.*]] = tosa.const_shape {value = dense<1> : tensor<4xindex>} : () -> !tosa.shape<4> -// CHECK: %[[C:.*]] = "tosa.const"() <{value = dense<127> : tensor}> : () -> tensor> +// CHECK: %[[CS:.*]] = tosa.const_shape {values = dense<1> : tensor<4xindex>} : () -> !tosa.shape<4> +// CHECK: %[[C:.*]] = "tosa.const"() <{values = dense<127> : tensor}> : () -> tensor> // CHECK: %[[RS1:.*]] = tosa.rescale %arg0 // CHECK: %[[RS2:.*]] = tosa.rescale %[[C]] // CHECK: %[[RS3:.*]] = tosa.rescale %[[RS2]] @@ -3571,8 +3910,8 @@ func.func @test_sub_with_unequal_ranks_qi8(%arg0: tensor<1x192x192x3x!quant.unif // ----- // CHECK-LABEL: test_add_with_unequal_ranks_qi8 -// CHECK: %[[CS:.*]] = tosa.const_shape {value = dense<1> : tensor<3xindex>} : () -> !tosa.shape<3> -// CHECK: %[[C:.*]] = "tosa.const"() <{value = dense<127> : tensor}> : () -> tensor> +// CHECK: %[[CS:.*]] = tosa.const_shape {values = dense<1> : tensor<3xindex>} : () -> !tosa.shape<3> +// CHECK: %[[C:.*]] = "tosa.const"() <{values = dense<127> : tensor}> : () -> tensor> // CHECK: %[[RS1:.*]] = tosa.rescale %arg0 // CHECK: %[[RS2:.*]] = tosa.rescale %[[C]] // CHECK: %[[RS3:.*]] = tosa.rescale %[[RS2]] @@ -3585,3 +3924,62 @@ func.func @test_add_with_unequal_ranks_qi8(%arg0: tensor<48x48x17x!quant.uniform %1 = tfl.add(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<48x48x17x!quant.uniform>, tensor>) -> tensor<48x48x17x!quant.uniform> func.return %1 : tensor<48x48x17x!quant.uniform> } + +// ----- + +// CHECK-LABEL: test_cast_ui8 +// CHECK-SAME: %[[VAL_0:.*]]: tensor<13x21x3xui8> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_6:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]], %[[VAL_5]] {input_unsigned = true, output_unsigned = false, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<13x21x3xui8>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<13x21x3x!quant.uniform> +// CHECK: %[[VAL_7:.*]] = tosa.rescale %[[VAL_6]], %[[VAL_2]], %[[VAL_3]], %[[VAL_5]], %[[VAL_1]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<13x21x3x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_8:.*]] = tosa.cast %[[VAL_7]] : (tensor<13x21x3xi32>) -> tensor<13x21x3xf32> +// CHECK: return %[[VAL_8]] : tensor<13x21x3xf32> +func.func @test_cast_ui8(%arg0: tensor<13x21x3xui8>) -> (tensor<13x21x3xf32>) { + %0 = "tfl.cast"(%arg0) : (tensor<13x21x3xui8>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- + +// CHECK-LABEL: test_cast_qi8 +// CHECK-SAME: 
%[[VAL_0:.*]]: tensor<13x21x3x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<-1> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_5:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<13x21x3x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<13x21x3xi32> +// CHECK: %[[VAL_6:.*]] = tosa.cast %[[VAL_5]] : (tensor<13x21x3xi32>) -> tensor<13x21x3xf32> +func.func @test_cast_qi8(%arg0: tensor<13x21x3x!quant.uniform>) -> (tensor<13x21x3xf32>) { + %0 = "tfl.cast"(%arg0) : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- + +// CHECK-LABEL: test_transpose_conv2d_bias_f32 +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<1.000000e+00> : tensor<128xf32>}> : () -> tensor<128xf32> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{values = dense<-1.000000e+00> : tensor<128x2x2x256xf32>}> : () -> tensor<128x2x2x256xf32> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> +// CHECK-DAG: %[[VAR3:.*]] = tosa.transpose_conv2d %arg0, %[[VAR1]], %[[VAR0]], %[[VAR2]], %[[VAR2]] {acc_type = f32, out_pad = array, stride = array} +func.func @test_transpose_conv2d_bias_f32(%arg0: tensor<1x64x64x256xf32>) -> tensor<1x128x128x128xf32> { + %cst = arith.constant dense<[1, 128, 128, 128]> : tensor<4xi32> + %0 = arith.constant dense<-1.000000e+00> : tensor<128x2x2x256xf32> + %1 = arith.constant dense<1.000000e+00> : tensor<128xf32> + %2 = "tfl.transpose_conv"(%cst, %0, %arg0, %1) {padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32, fused_activation_function = "NONE"} : (tensor<4xi32>, tensor<128x2x2x256xf32>, tensor<1x64x64x256xf32>, tensor<128xf32>) -> tensor<1x128x128x128xf32> + return %2 : tensor<1x128x128x128xf32> +} + +// ----- + +// CHECK-LABEL: test_concat_qconst +// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{values = dense<42> : tensor<28x19xi8>}> : () -> tensor<28x19x!quant.uniform> +// CHECK-DAG: %[[VAR1:.*]] = tosa.concat %[[VAR0]], %arg0 {axis = 0 : i32} : (tensor<28x19x!quant.uniform>, tensor<1x19x!quant.uniform>) -> tensor<29x19x!quant.uniform> +func.func @test_concat_qconst(%arg0: tensor<1x19x!quant.uniform> ) -> tensor<29x19x!quant.uniform> { + %0 = "tfl.pseudo_qconst"() {qtype = tensor<28x19x!quant.uniform>, value = dense<42> : tensor<28x19xi8>} : () -> tensor<28x19x!quant.uniform> + %1 = "tfl.concatenation"(%0, %arg0) {axis = 0 : i32, fused_activation_function = "NONE"}: (tensor<28x19x!quant.uniform>, tensor<1x19x!quant.uniform>) -> tensor<29x19x!quant.uniform> + return %1 : tensor<29x19x!quant.uniform> +} diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-stateful.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-stateful.mlir index 2453efb5ca90..9c3cffb8651e 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-stateful.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-stateful.mlir @@ -1,18 +1,19 @@ -// RUN: tf-opt --split-input-file --tfl-to-tosa-pipeline --verify-each %s | FileCheck %s -// REQUIRES: tf_tosa -// RUN: tf-opt 
--split-input-file --tf-tfl-to-tosa-pipeline --verify-each %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --split-input-file --tfl-to-tosa-pipeline --verify-each %s | FileCheck %s + +// RUN: tf-tosa-opt --split-input-file --tf-tfl-to-tosa-pipeline --verify-each %s | FileCheck %s + // Operations for testing tfl-to-tosa-pipeline // ----- +// CHECK-LABEL: tosa.variable @var_x = dense<7.000000e+00> : tensor<1xf32> +// CHECK-LABEL: test_stateful_ops( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xf32> +// CHECK: tosa.variable_write @var_x, %[[VAL_0]] : tensor<1xf32> +// CHECK: %[[VAL_1:.*]] = tosa.variable_read @var_x : tensor<1xf32> +// CHECK: return %[[VAL_1]] : tensor<1xf32> module attributes {tf_saved_model.semantics, tfl.description = "Test.", tfl.schema_version = 3 : i32} { - // CHECK: tosa.variable @var_x = dense<7.000000e+00> : tensor<1xf32> - // CHECK-LABEL: test_stateful_ops - // CHECK: tosa.variable.write @var_x, %arg0 : tensor<1xf32> - // CHECK: %[[VAL_0:.*]] = tosa.variable.read @var_x : tensor<1xf32> - // CHECK: return %[[VAL_0]] : tensor<1xf32> func.func @test_stateful_ops(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["placeholder_0"]}) -> (tensor<1xf32> {tf_saved_model.index_path = ["output_0"]}) attributes {tf_saved_model.exported_names = ["serving_default"]} { @@ -34,18 +35,24 @@ module attributes {tf_saved_model.semantics, tfl.description = "Test.", tfl.sche // ----- +// CHECK-LABEL: tosa.variable @Variable = dense<42> : tensor<2x3xi8> +// CHECK-LABEL: readAssignQuant +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2x3x!quant.uniform> +// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<49> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<2> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<11> : tensor<1xi8>}> : () -> tensor<1xi8> +// CHECK: %[[VAL_5:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK: %[[VAL_6:.*]] = tosa.variable_read @Variable : tensor<2x3xi8> +// CHECK: %[[VAL_7:.*]] = builtin.unrealized_conversion_cast %[[VAL_6]] : tensor<2x3xi8> to tensor<2x3x!quant.uniform> +// CHECK: %[[VAL_8:.*]] = tosa.rescale %[[VAL_7]], %[[VAL_5]], %[[VAL_4]], %[[VAL_3]], %[[VAL_2]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<2x3x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_9:.*]] = tosa.rescale %[[VAL_0]], %[[VAL_5]], %[[VAL_4]], %[[VAL_3]], %[[VAL_2]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<2x3x!quant.uniform>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_10:.*]] = tosa.add %[[VAL_8]], %[[VAL_9]] : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_11:.*]] = tosa.rescale %[[VAL_10]], %[[VAL_5]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]] {input_unsigned = false, output_unsigned = false, per_channel = false, rounding_mode = "DOUBLE_ROUND", scale32 = true} : (tensor<2x3xi32>, tensor<1xi32>, tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) -> tensor<2x3x!quant.uniform> +// CHECK: %[[VAL_12:.*]] = builtin.unrealized_conversion_cast %[[VAL_11]] : tensor<2x3x!quant.uniform> to tensor<2x3xi8> +// CHECK: tosa.variable_write @Variable, 
%[[VAL_12]] : tensor<2x3xi8> +// CHECK: return %[[VAL_11]] : tensor<2x3x!quant.uniform> module { - // CHECK: tosa.variable @Variable = dense<42> : tensor<2x3xi8> - // CHECK-LABEL: readAssignQuant - // CHECK: %[[VAL_0:.*]] = tosa.variable.read @Variable : tensor<2x3xi8> - // CHECK: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[VAL_0]] : tensor<2x3xi8> to tensor<2x3x!quant.uniform> - // CHECK: %[[VAL_2:.*]] = tosa.rescale %[[VAL_1]] {double_round = true, input_zp = 2 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} : (tensor<2x3x!quant.uniform>) -> tensor<2x3xi32> - // CHECK: %[[VAL_3:.*]] = tosa.rescale %[[VAL_4:.*]] {double_round = true, input_zp = 2 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} : (tensor<2x3x!quant.uniform>) -> tensor<2x3xi32> - // CHECK: %[[VAL_5:.*]] = tosa.add %[[VAL_2]], %[[VAL_3]] : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> - // CHECK: %[[VAL_6:.*]] = tosa.rescale %[[VAL_5]] {double_round = true, input_zp = 0 : i32, multiplier = array, output_zp = 2 : i32, per_channel = false, scale32 = true, shift = array} : (tensor<2x3xi32>) -> tensor<2x3x!quant.uniform> - // CHECK: %[[VAL_7:.*]] = builtin.unrealized_conversion_cast %[[VAL_6]] : tensor<2x3x!quant.uniform> to tensor<2x3xi8> - // CHECK: tosa.variable.write @Variable, %[[VAL_7]] : tensor<2x3xi8> - // CHECK: return %[[VAL_6]] : tensor<2x3x!quant.uniform> func.func @readAssignQuant(%arg0: tensor<2x3x!quant.uniform>) -> (tensor<2x3x!quant.uniform>) { "tfl.call_once"() {session_init_function = "ReadAssignInit"} : () -> () %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-unequal-ranks.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-unequal-ranks.mlir index 62f22d91e3d6..c4d077925495 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-unequal-ranks.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-unequal-ranks.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt --split-input-file --tfl-to-tosa-pipeline --verify-each %s | FileCheck %s -// REQUIRES: tf_tosa +// RUN: tf-tosa-opt --split-input-file --tfl-to-tosa-pipeline --verify-each %s | FileCheck %s + // Test tf legalization that produce TOSA ResultsBroadcastableShape operators with unequal ranks // ----- @@ -109,9 +109,18 @@ func.func @test_mul_qi8(%arg0: tensor<13x21x3x!quant.uniform> } +// ----- +// CHECK-LABEL: test_floor_div +// CHECK: tosa.intdiv +// CHECK: tosa.select +func.func @test_floor_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor<1x13x1x3xi32>) -> tensor<1x13x21x3xi32> { + %0 = "tfl.floor_div"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x3xi32>, tensor<1x13x1x3xi32>) -> tensor<1x13x21x3xi32> + func.return %0 : tensor<1x13x21x3xi32> +} + // ----- // CHECK-LABEL: test_div -// CHECK: tosa.int_div +// CHECK: tosa.intdiv func.func @test_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor) -> tensor<*xi32> { %0 = "tfl.div"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x3xi32>, tensor) -> tensor<*xi32> func.return %0 : tensor<*xi32> diff --git a/tensorflow/compiler/mlir/tosa/tests/verify_fully_converted.mlir b/tensorflow/compiler/mlir/tosa/tests/verify_fully_converted.mlir index ac918b321356..c8c8eb46c58c 100644 --- a/tensorflow/compiler/mlir/tosa/tests/verify_fully_converted.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/verify_fully_converted.mlir @@ -1,5 +1,5 @@ -// RUN: tf-opt %s 
--tosa-tflite-verify-fully-converted --split-input-file -verify-diagnostics
-// REQUIRES: tf_tosa
+// RUN: tf-tosa-opt %s --tosa-tflite-verify-fully-converted --split-input-file -verify-diagnostics
+
 
 // CHECK-LABEL: func.func @main
 func.func @main(%arg0: tensor<2xf32>) -> (tensor<2xf32>) {
diff --git a/tensorflow/compiler/mlir/tosa/tf_tosa_opt.cc b/tensorflow/compiler/mlir/tosa/tf_tosa_opt.cc
new file mode 100644
index 000000000000..9dd433708778
--- /dev/null
+++ b/tensorflow/compiler/mlir/tosa/tf_tosa_opt.cc
@@ -0,0 +1,81 @@
+/* Copyright 2019 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "mlir/InitAllPasses.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Tools/mlir-opt/MlirOptMain.h"  // from @llvm-project
+#include "mlir/Transforms/Passes.h"  // from @llvm-project
+#include "tensorflow//compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h"
+#include "tensorflow/compiler/mlir/init_mlir.h"
+#include "tensorflow/compiler/mlir/lite/transforms/passes.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h"
+#include "tensorflow/compiler/mlir/register_common_dialects.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/runtime_passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/test_passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/mlprogram_util.h"
+#include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h"
+#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h"
+#include "tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.h"
+#include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h"
+#include "tensorflow/compiler/mlir/tosa/tf_passes.h"
+#include "tensorflow/compiler/mlir/tosa/tf_tfl_passes.h"
+#include "tensorflow/compiler/mlir/tosa/tfl_passes.h"
+#include "tensorflow/compiler/mlir/tosa/transforms/passes.h"
+#include "xla/mlir/framework/transforms/passes.h"
+#include "xla/mlir_hlo/mhlo/transforms/passes.h"
+
+int main(int argc, char** argv) {
+  tensorflow::InitMlir y(&argc, &argv);
+
+  mlir::registerAllPasses();
+  mlir::registerTransformsPasses();
+  mlir::registerTensorFlowPasses();
+  mlir::TFDevice::registerTensorFlowDevicePasses();
+  mlir::tf_saved_model::registerTensorFlowSavedModelPasses();
+  mlir::TFL::registerTensorFlowLitePasses();
+  mlir::mhlo::registerAllMhloPasses();
+
+  // These are in compiler/mlir/tf2xla and not part of the above MHLO passes.
+  mlir::mhlo::registerLegalizeTfPasses();
+  mlir::mhlo::registerTfXlaPasses();
+  mlir::quant::stablehlo::registerBridgePasses();
+  tensorflow::tf2xla::internal::registerTFXLABridgeClusteringPasses();
+  tensorflow::tf2xla::internal::registerTFXLABridgeMlirToGraphPasses();
+  mlir::tf_test::registerTensorFlowTestPasses();
+  mlir::xla_framework::registerXlaFrameworkPasses();
+  tensorflow::RegisterConvertMlirToXlaHloPipelineWithDefaults();
+  tensorflow::RegisterGraphOptimizationPasses();
+  tensorflow::RegisterMlProgramPasses();
+  mlir::TFTPU::registerRuntimeLoweringPasses();
+  mlir::TFDevice::registerSparseCorePasses();
+  mlir::tosa::registerLegalizeTosaPasses();
+  mlir::tosa::registerTFtoTOSALegalizationPipeline();
+  mlir::tosa::registerTFLtoTOSALegalizationPipeline();
+  mlir::tosa::registerTFTFLtoTOSALegalizationPipeline();
+
+  tensorflow::tfrt_compiler::RegisterTPULowerClusterToRuntimeOpsPassPipeline();
+  tensorflow::tfrt_compiler::
+      RegisterNonTPULowerClusterToRuntimeOpsPassPipeline();
+
+  mlir::DialectRegistry registry;
+  mlir::RegisterCommonToolingDialects(registry);
+
+  return failed(
+      mlir::MlirOptMain(argc, argv, "TensorFlow pass driver\n", registry));
+}
diff --git a/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc b/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc
index 0ed3feec94f8..2931e5ae4654 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc
@@ -28,16 +28,16 @@ limitations under the License.
 #include 
 #include 
 
-#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project
-#include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project
-#include "mlir/Dialect/Tosa/Utils/QuantUtils.h" // from @llvm-project
-#include "mlir/IR/Builders.h" // from @llvm-project
-#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project
-#include "mlir/IR/BuiltinTypes.h" // from @llvm-project
-#include "mlir/IR/PatternMatch.h" // from @llvm-project
-#include "mlir/Pass/PassRegistry.h" // from @llvm-project
-#include "mlir/Support/LLVM.h" // from @llvm-project
-#include "mlir/Support/LogicalResult.h" // from @llvm-project
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/Dialect/Tosa/IR/TosaOps.h"  // from @llvm-project
+#include "mlir/Dialect/Tosa/Utils/QuantUtils.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
 #include "tensorflow/compiler/mlir/tosa/transforms/legalize_common.h"
@@ -129,44 +129,47 @@ struct ConvertUint8QConstOp : public RewritePattern {
 
 namespace {
 
-// returns true iff @a shaped_type has element type that is uint8 or uniform
-// quantized unsigned 8 if it is, then return the rescaled type, uint8_zp, and
-// output_zp to use to rescale type to signed type with adjusted zero point.
-bool getUint8RescaleInfo(OpBuilder& builder, ShapedType shaped_type, - Type& rescaled_type, int32_t& uint8_zp, - int32_t& output_zp) { - auto element_type = shaped_type.getElementType(); - - if (auto quant_type = - dyn_cast(element_type)) { - if (quant_type.isSigned() || quant_type.getStorageTypeIntegralWidth() != 8) - return false; - // element_type is uniform_quantized unsigned 8 bit - double type_range_min = static_cast(quant_type.getStorageTypeMin() - - quant_type.getZeroPoint()) * - quant_type.getScale(); - double type_range_max = static_cast(quant_type.getStorageTypeMax() - - quant_type.getZeroPoint()) * - quant_type.getScale(); - bool narrow_range = quant_type.getStorageTypeMin() == 1 ? true : false; - - rescaled_type = shaped_type.clone(buildQTypeFromMinMax( - builder, quant_type.getExpressedType(), +// returns true iff @a type is a shaped type with element type that is uint8 +// if it is, then return the rescaled type, uint8_zp, and output_zp to use to +// rescale type to signed type with adjusted zero point. +bool IsShapedUint8Type(OpBuilder &builder, const Type type, Type &rescaled_type, + int32_t &uint8_zp, int32_t &output_zp) { + auto uint8_type = dyn_cast(type); + if (!uint8_type) return false; + + auto element_type = uint8_type.getElementType(); + auto uint8_element_quant_type = + dyn_cast(element_type); + bool is_uint8_element_quant_type = + uint8_element_quant_type && !uint8_element_quant_type.isSigned() && + uint8_element_quant_type.getStorageTypeIntegralWidth() == 8; + bool is_uint8_element_type = element_type.isUnsignedInteger(8); + if (!is_uint8_element_quant_type && !is_uint8_element_type) return false; + + // type has uint8 element type + if (is_uint8_element_quant_type) { + double type_range_min = + static_cast(uint8_element_quant_type.getStorageTypeMin() - + uint8_element_quant_type.getZeroPoint()) * + uint8_element_quant_type.getScale(); + double type_range_max = + static_cast(uint8_element_quant_type.getStorageTypeMax() - + uint8_element_quant_type.getZeroPoint()) * + uint8_element_quant_type.getScale(); + bool narrow_range = + uint8_element_quant_type.getStorageTypeMin() == 1 ? 
true : false; + + rescaled_type = uint8_type.clone(buildQTypeFromMinMax( + builder, uint8_element_quant_type.getExpressedType(), builder.getF64FloatAttr(type_range_min), builder.getF64FloatAttr(type_range_max), - builder.getI32IntegerAttr(quant_type.getStorageTypeIntegralWidth()), - /* filterQuantDim = */ 0, - /* isSigned = */ true, builder.getBoolAttr(narrow_range))); - uint8_zp = quant_type.getZeroPoint(); - output_zp = uint8_zp - 128; - return true; - } - - if (auto int_type = dyn_cast(element_type)) { - if (!int_type.isUnsigned() || int_type.getWidth() != 8) return false; - // element_type is ui8 + builder.getI32IntegerAttr( + uint8_element_quant_type.getStorageTypeIntegralWidth()), + 0, true /* signed */, builder.getBoolAttr(narrow_range))); + uint8_zp = uint8_element_quant_type.getZeroPoint(); + } else { // convert ui8 to i8 with zp=-128 - rescaled_type = shaped_type.clone(quant::UniformQuantizedType::getChecked( + rescaled_type = uint8_type.clone(quant::UniformQuantizedType::getChecked( builder.getUnknownLoc(), quant::QuantizationFlags::Signed, builder.getI8Type(), builder.getF32Type(), /* scale = */ 1.0, @@ -174,11 +177,9 @@ bool getUint8RescaleInfo(OpBuilder& builder, ShapedType shaped_type, /* storagTypeMin = */ -128, /* storageTypeMax = */ 127)); uint8_zp = 0; - output_zp = uint8_zp - 128; - return true; } - - return false; + output_zp = uint8_zp - 128; + return true; } } // namespace @@ -188,6 +189,7 @@ LogicalResult convert_graph_uint8_tensor(mlir::MLIRContext &context, size_t num_blocks_in_main = 0; mlir::Region *region = function.getCallableRegion(); OpBuilder builder(&context); + auto loc = function.getLoc(); auto tmp_const_type = RankedTensorType::get({1}, builder.getIntegerType(8)); auto tmp_const_attr = @@ -204,34 +206,51 @@ LogicalResult convert_graph_uint8_tensor(mlir::MLIRContext &context, return function.emitError("Invalid MLIR: block must be entry block"); } + auto multiplier = tosa::getConstTensorInt(builder, loc, {1 << 30}); + auto shift = tosa::getConstTensorInt(builder, loc, {30}); + // Insert rescale uint8->int8 after placeholders. for (Value arg : bb.getArguments()) { auto shaped_type = dyn_cast(arg.getType()); if (!shaped_type) continue; Type rescaled_type; - int32_t rescale_input_zp, rescale_output_zp; - if (!getUint8RescaleInfo(builder, shaped_type, rescaled_type, - rescale_input_zp, rescale_output_zp)) + int32_t rescale_input_zp_val, rescale_output_zp_val; + if (!IsShapedUint8Type(builder, arg.getType(), rescaled_type, + rescale_input_zp_val, rescale_output_zp_val)) continue; // Keep original input_val use with tmp_val. - Value tmp_val = builder.create( - function.getLoc(), tmp_const_type, tmp_const_attr); + Value tmp_val = + builder.create(loc, tmp_const_type, tmp_const_attr); arg.replaceAllUsesWith(tmp_val); + // mlir::quant::UniformQuantizedType uses signless storage type. + // For example, tensor<1x!quant.uniform> has the same storage type + // as tensor<1xi8>. 
+ auto rescale_input_zp = tosa::getConstTensorInt( + builder, loc, {static_cast(rescale_input_zp_val)}); + auto rescale_output_zp = tosa::getConstTensorInt( + builder, loc, {static_cast(rescale_output_zp_val)}); + auto rescale_op = builder.create( - function.getLoc(), rescaled_type, arg, - builder.getI32IntegerAttr(rescale_input_zp), - builder.getI32IntegerAttr(rescale_output_zp), - builder.getDenseI32ArrayAttr({1 << 30}), - builder.getDenseI8ArrayAttr({30}), builder.getBoolAttr(true), - builder.getBoolAttr(false), builder.getBoolAttr(false)); + loc, rescaled_type, arg, multiplier, shift, rescale_input_zp, + rescale_output_zp, + /* scale32 = */ builder.getBoolAttr(true), + /* rounding_mode = */ builder.getStringAttr("SINGLE_ROUND"), + /* per_channel = */ builder.getBoolAttr(false), + /* input_unsigned = */ builder.getBoolAttr(true), // uint8_t -> + /* output_unsigned = */ builder.getBoolAttr(false)); // int8_t Operation *op_rescale_op = static_cast(rescale_op); bb.push_front(op_rescale_op); tmp_val.replaceAllUsesWith(rescale_op.getResult()); tmp_val.getDefiningOp()->erase(); + bb.push_front(rescale_output_zp.getDefiningOp()); + bb.push_front(rescale_input_zp.getDefiningOp()); } + bb.push_front(shift.getDefiningOp()); + bb.push_front(multiplier.getDefiningOp()); + // Record types of original graph output before we convert intermediate // tensor. auto terminator = bb.getTerminator(); @@ -242,13 +261,17 @@ LogicalResult convert_graph_uint8_tensor(mlir::MLIRContext &context, // Convert intermediate tensor. for (auto &op : bb) { + if (llvm::dyn_cast(&op)) { + continue; // Skip if the operation is a tosa::ConstOp + } + for (Value output_val : op.getResults()) { auto shaped_type = dyn_cast(output_val.getType()); if (!shaped_type) continue; Type new_type; - int32_t rescale_input_zp, rescale_output_zp; - if (getUint8RescaleInfo(builder, shaped_type, new_type, - rescale_input_zp, rescale_output_zp)) { + int32_t unused_input_zp, unused_output_zp; + if (IsShapedUint8Type(builder, output_val.getType(), new_type, + unused_input_zp, unused_output_zp)) { output_val.setType(new_type); } } @@ -268,44 +291,55 @@ LogicalResult convert_graph_uint8_tensor(mlir::MLIRContext &context, Value input_val = defining_op->getResult(0); // Check if graph output is uint8 type. - auto shaped_output_type = dyn_cast(output_types[i]); - if (!shaped_output_type) continue; + auto uint8_output_type = dyn_cast(output_types[i]); + if (!uint8_output_type) continue; // Check if graph output is uint8 type. Type rescaled_type; - int32_t uint8_zp, rescale_output_zp; - if (!getUint8RescaleInfo(builder, shaped_output_type, rescaled_type, - uint8_zp, rescale_output_zp)) + int32_t uint8_zp_val, unused_output_zp_val; + if (!IsShapedUint8Type(builder, output_types[i], rescaled_type, + uint8_zp_val, unused_output_zp_val)) continue; // convert terminator operand type back to original output_type. auto terminator_operand_type = dyn_cast(terminator->getOperand(i).getType()); if (!terminator_operand_type) continue; - int operand_zp = 0; + int operand_zp_val = 0; auto quantized_type = dyn_cast( terminator_operand_type.getElementType()); if (quantized_type) { - operand_zp = quantized_type.getZeroPoint(); + operand_zp_val = quantized_type.getZeroPoint(); } // Keep original input_val use with tmp_val. 
- Value tmp_val = builder.create( - function.getLoc(), tmp_const_type, tmp_const_attr); - input_val.replaceAllUsesWith(tmp_val); + Value tmp_val = + builder.create(loc, tmp_const_type, tmp_const_attr); + input_val.replaceUsesWithIf(tmp_val, [&terminator](OpOperand &use) { + return use.getOwner() == terminator; + }); + + auto rescale_input_zp = tosa::getConstTensorInt( + builder, loc, {static_cast(operand_zp_val)}); + auto rescale_output_zp = tosa::getConstTensorInt( + builder, loc, {static_cast(uint8_zp_val)}); + auto rescale_op = builder.create( - function.getLoc(), shaped_output_type, input_val, - builder.getI32IntegerAttr(operand_zp), - builder.getI32IntegerAttr(uint8_zp), - builder.getDenseI32ArrayAttr({1 << 30}), - builder.getDenseI8ArrayAttr({30}), builder.getBoolAttr(true), - builder.getBoolAttr(false), builder.getBoolAttr(false)); + loc, uint8_output_type, input_val, multiplier, shift, + rescale_input_zp, rescale_output_zp, + /* scale32 = */ builder.getBoolAttr(true), + /* rounding_mode = */ builder.getStringAttr("SINGLE_ROUND"), + /* per_channel = */ builder.getBoolAttr(false), + /* input_unsigned = */ builder.getBoolAttr(false), // int8_t -> + /* output_unsigned = */ builder.getBoolAttr(true)); // uint8_t Operation *op_rescale_op = static_cast(rescale_op); bb.push_back(op_rescale_op); op_rescale_op->moveBefore(terminator); tmp_val.replaceAllUsesWith(rescale_op.getResult()); tmp_val.getDefiningOp()->erase(); + bb.push_front(rescale_output_zp.getDefiningOp()); + bb.push_front(rescale_input_zp.getDefiningOp()); } } diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc index 7abe46edb7fc..d0bc0d6b57d5 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc @@ -46,6 +46,7 @@ limitations under the License. 
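Before the legalize_common.cc hunks, a short aside on the multiplier/shift pairs used by the rescales above: a real-valued scale is encoded as multiplier / 2^shift, so 1 << 30 with shift 30 encodes exactly 1.0. A self-contained sketch of one such encoding and its application, assuming round-half-up rounding (DecomposeScale and ApplyScale are illustrative names, not the TOSA reference implementation):

```c++
#include <cmath>
#include <cstdint>
#include <cstdio>

// Decompose a real scale into a 32-bit multiplier and a right shift such
// that scale ~= multiplier / 2^shift. Note that the encoding is not unique:
// (1 << 29, 29) and (1 << 30, 30) both represent a scale of 1.0.
void DecomposeScale(double scale, int32_t& multiplier, int32_t& shift) {
  int exp = 0;
  const double mantissa = std::frexp(scale, &exp);  // scale = mantissa * 2^exp
  multiplier = static_cast<int32_t>(std::round(mantissa * (1u << 30)));
  shift = 30 - exp;
}

// Apply the fixed-point scale with round-half-up rounding (assumes the
// product fits in 64 bits, which holds for the small values used here).
int64_t ApplyScale(int64_t value, int32_t multiplier, int32_t shift) {
  const int64_t round = shift > 0 ? (INT64_C(1) << (shift - 1)) : 0;
  return (value * multiplier + round) >> shift;
}

int main() {
  int32_t m = 0, s = 0;
  DecomposeScale(1.0, m, s);
  std::printf("scale 1.0 -> multiplier=%d shift=%d -> 200 maps to %lld\n",
              static_cast<int>(m), static_cast<int>(s),
              static_cast<long long>(ApplyScale(200, m, s)));
  return 0;
}
```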
#include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project #include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" // from @llvm-project #include "mlir/Dialect/Tosa/Utils/QuantUtils.h" // from @llvm-project +#include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" // from @llvm-project #include "mlir/Dialect/Utils/StaticValueUtils.h" // from @llvm-project #include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project @@ -592,9 +593,6 @@ std::optional convertMultiplyOp(PatternRewriter& rewriter, Operation* op, return std::nullopt; } - if (EqualizeRanks(rewriter, op->getLoc(), input_lhs_val, input_rhs_val) - .failed()) - return std::nullopt; input_lhs_type = dyn_cast(input_lhs_val.getType()); input_rhs_type = dyn_cast(input_rhs_val.getType()); @@ -628,7 +626,7 @@ std::optional convertMultiplyOp(PatternRewriter& rewriter, Operation* op, rewriter, op, rescale_type, op1_rescale_lhs, op2_rescale_rhs); return buildRescale(rewriter, op, output_type, op3_mul_op1_op2.getResult(), output_rescale_scale, 0, output_qtype.getZeroPoint(), - true, scale32); + "DOUBLE_ROUND", scale32); } return CreateMulOpAndInfer(rewriter, op, output_type, input_lhs_val, @@ -669,12 +667,6 @@ std::optional convertSquaredDifferenceOp(PatternRewriter& rewriter, return std::nullopt; } - if (EqualizeRanks(rewriter, op->getLoc(), x, y) - .failed()) - return std::nullopt; - x_type = dyn_cast(x.getType()); - y_type = dyn_cast(y.getType()); - // If the output is I8 then we need to rescale to I32 // Then scale back to I8 if (result_is_qtype) { @@ -706,14 +698,15 @@ std::optional convertSquaredDifferenceOp(PatternRewriter& rewriter, (twice_max_input_scale * twice_max_input_scale) / ((static_cast(1 << LEFT_SHIFT * 2)) * result_scale); - Value x_scaled = buildRescaleToInt32( - rewriter, op, x, - x_rescale_scale * static_cast(1 << LEFT_SHIFT), - x_qtype.getZeroPoint()); - Value y_scaled = buildRescaleToInt32( - rewriter, op, y, - y_rescale_scale * static_cast(1 << LEFT_SHIFT), - y_qtype.getZeroPoint()); + Value x_shift = buildRescaleToInt32(rewriter, op, x, (1 << LEFT_SHIFT), + x_qtype.getZeroPoint()); + Value y_shift = buildRescaleToInt32(rewriter, op, y, (1 << LEFT_SHIFT), + y_qtype.getZeroPoint()); + + Value x_scaled = + buildRescaleToInt32(rewriter, op, x_shift, x_rescale_scale, 0); + Value y_scaled = + buildRescaleToInt32(rewriter, op, y_shift, y_rescale_scale, 0); auto sub_op = CreateOpAndInfer( rewriter, op->getLoc(), rescale_type, x_scaled, y_scaled); @@ -809,7 +802,7 @@ std::optional convertConcatV2Op(PatternRewriter& rewriter, Operation* op, operand_type.getShape(), result_quant_type); Value rescale_op = buildRescale( rewriter, op, rescale_type, v, operand_scale / result_scale, - operand_zeropoint, result_zeropoint, false, true); + operand_zeropoint, result_zeropoint, "SINGLE_ROUND", true); values_rescaled.push_back(rescale_op); } else { values_rescaled.push_back(v); @@ -1585,7 +1578,7 @@ std::optional convertSoftmaxOp(PatternRewriter& rewriter, Operation* op, } // reduce_sum on last dimension - int32_t input_rank = input_type.getShape().size(); + int32_t input_rank = input_type.getRank(); ArrayRef logits_shape = output_type.getShape(); if (mlir::isa(input_type.getElementType()) && @@ -1618,7 +1611,7 @@ std::optional convertSoftmaxOp(PatternRewriter& rewriter, Operation* op, // Step 1. 
get x - max(x) Value op1_rescale_in = buildRescale(rewriter, op, int32_logits_type, logits_value, 1.0f, - in_quant_type.getZeroPoint(), 0, false, true); + in_quant_type.getZeroPoint(), 0, "SINGLE_ROUND", true); auto op2_reducemax_op1 = CreateOpAndInfer( rewriter, op->getLoc(), int32_rsum_type, op1_rescale_in, @@ -1643,7 +1636,7 @@ std::optional convertSoftmaxOp(PatternRewriter& rewriter, Operation* op, Value op4_rescale_op3 = buildRescale(rewriter, op, int16_logits_type, - op3_sub_op1_op2.getResult(), 128.0, 0, 0, false, true); + op3_sub_op1_op2.getResult(), 128.0, 0, 0, "SINGLE_ROUND", true); // Input is 9.7, where lower 7 bits are all zeros. // Output is 23 bits, where lower 7 bits should be all zeros as well, @@ -1811,13 +1804,13 @@ std::optional convertSoftmaxOp(PatternRewriter& rewriter, Operation* op, return buildRescale(rewriter, op, output_type, op28_rshift_op26_op27.getResult(), 1.0, 0, - out_quant_type.getZeroPoint(), false, true); + out_quant_type.getZeroPoint(), "SINGLE_ROUND", true); } else if (in_quant_type.getStorageTypeIntegralWidth() == 16) { // Step 1. get x - max(x) Value op1_rescale_in = buildRescale(rewriter, op, int32_logits_type, logits_value, 1.0f, - in_quant_type.getZeroPoint(), 0, false, true); + in_quant_type.getZeroPoint(), 0, "SINGLE_ROUND", true); auto op2_reducemax_op1 = CreateOpAndInfer( rewriter, op->getLoc(), int32_rsum_type, op1_rescale_in, @@ -1832,8 +1825,8 @@ std::optional convertSoftmaxOp(PatternRewriter& rewriter, Operation* op, auto exp_func = [](double x) -> double { return std::exp(x); }; // Follow TFLite reference: tensorflow/lite/kernels/activations.cc - Value exp_table_const = - getTosaConst16bitTable(rewriter, op, exp_func, -10.0, 0); + Value exp_table_const = getTosaConst16bitTable( + rewriter, op, 10.0 / 65535.0, 32767, 2.0 / 65535.0, 0, exp_func); double input_diff_scale = in_quant_type.getScale() / (10.0 / 65535.0); @@ -1841,7 +1834,7 @@ std::optional convertSoftmaxOp(PatternRewriter& rewriter, Operation* op, Value op4_rescale_op3 = buildRescale( rewriter, op, int32_logits_type, op3_sub_op1_op2.getResult(), /*scale=*/input_diff_scale, /*input_zp=*/0, /*output_zp=*/0, - /*double_round=*/true, /*scale32=*/true); + /*rounding_mode=*/"DOUBLE_ROUND", /*scale32=*/true); auto op5_add_op4 = CreateOpAndInfer( rewriter, op->getLoc(), int32_logits_type, op4_rescale_op3, getTosaConstTensorSingleI32(rewriter, op, 32767, input_rank)); @@ -1906,8 +1899,9 @@ std::optional convertSoftmaxOp(PatternRewriter& rewriter, Operation* op, return 1.0 / (1.0 + x); }; - Value one_over_one_plus_x_table_const = getTosaConst16bitTable( - rewriter, op, one_over_one_plus_x_func, 0.0, 1.0); + Value one_over_one_plus_x_table_const = getTosaConst16bitTable( + rewriter, op, 1.0 / 65535.0, -32768, 2.0 / 65535.0, 0, + one_over_one_plus_x_func); // Get (1 / sum(exp(x))) result as 23 bits (including sign bit) auto op17_table_op16 = CreateOpAndInfer( @@ -1939,7 +1933,7 @@ std::optional convertSoftmaxOp(PatternRewriter& rewriter, Operation* op, return buildRescale(rewriter, op, output_type, op21_rshift_op19_op20.getResult(), (1.0 / out_quant_type.getScale()) * (1.0 / 32768.0), - 0, out_quant_type.getZeroPoint(), false, true); + 0, out_quant_type.getZeroPoint(), "SINGLE_ROUND", true); } else { (void)rewriter.notifyMatchFailure(op, "unknown quantization bitwidth"); return std::nullopt; @@ -2735,17 +2729,114 @@ std::optional convertStridedSliceOp( return reverseNegativeStride(rewriter, op, a4_reshape_op, strides); } +// Helper function to perform division with floor rounding mode 
(rounding result +// down) for integer type inputs. +Value floorIntDiv(PatternRewriter& rewriter, Operation* op, + ShapedType output_type, Value lhs_value, Value rhs_value) { + // To implement floor div int input, utilize tosa::IntDivOp (trunc div + // result - rounds towards zero) with the following formula elementwise: + // floor_value = trunc_value - ((trunc_value * rhs_value != lhs_value) + // && (sign(lhs_value) != sign(rhs_value))) + // + // a1 = intdiv(lhs_value, rhs_value); // IntDivOp return truncated result + // a2 = mul(lhs_value, rhs_value); + // a3 = mul(rhs_value, a1); + // a4 = eq(lhs_value, a3); + // a5 = not(a4); // (trunc_value * rhs_value != lhs_value) + // a6 = gt(zero, a2); // (sign(lhs_value) != sign(rhs_value)) + // a7 = sub(a1, one); + // a8 = and(a5, a6); // (trunc_value * rhs_value != lhs_value) && + // (sign(lhs_value) != sign(rhs_value)) + // a9 = select(a8, a7, a1); + // return a9; + + ShapedType lhs_type = dyn_cast(lhs_value.getType()); + ShapedType rhs_type = dyn_cast(rhs_value.getType()); + + ImplicitLocOpBuilder builder(op->getLoc(), rewriter); + + ShapedType output_i32_type = output_type.clone(rewriter.getIntegerType(32)); + ShapedType output_bool_type = output_type.clone(rewriter.getIntegerType(1)); + + Value zero = + getTosaConstTensorSingleI32(rewriter, op, 0, output_type.getRank()); + Value one = + getTosaConstTensorSingleI32(rewriter, op, 1, output_type.getRank()); + + auto output_shape_value = getTosaConstShape( + rewriter, op->getLoc(), + tensorflow::ConvertMlirShapeToTF(output_type.getShape())); + + Value lhs_value_casted = CreateOpAndInfer( + rewriter, op->getLoc(), lhs_type.clone(rewriter.getIntegerType(32)), + lhs_value); + + Value lhs_value_reshaped = + CreateOpAndInfer(rewriter, op->getLoc(), output_i32_type, + lhs_value_casted, output_shape_value); + + Value rhs_value_casted = CreateOpAndInfer( + rewriter, op->getLoc(), rhs_type.clone(rewriter.getIntegerType(32)), + rhs_value); + + // TOSA IntDiv requires inputs to be i32 + auto a1_int_div_op = + CreateOpAndInfer(rewriter, op->getLoc(), output_i32_type, + lhs_value_casted, rhs_value_casted); + + auto a1_int_div_op_casted = CreateOpAndInfer( + rewriter, op->getLoc(), output_type, a1_int_div_op.getResult()); + + auto a2_lhs_mul_rhs_op = + CreateMulOpAndInfer(rewriter, op, output_type, lhs_value, rhs_value); + + auto a3_rhs_mul_a1_op = CreateMulOpAndInfer( + rewriter, op, output_type, rhs_value, a1_int_div_op_casted.getResult()); + + auto a4_lhs_eq_a3_op = CreateOpAndInfer( + rewriter, op->getLoc(), output_bool_type, lhs_value_reshaped, + a3_rhs_mul_a1_op.getResult()); + + // (trunc_value * rhs_value != lhs_value) + auto a5_not_a4_op = CreateOpAndInfer( + rewriter, op->getLoc(), output_bool_type, a4_lhs_eq_a3_op.getResult()); + + // (sign(lhs_value) != sign(rhs_value)) + auto a6_zero_gt_a2_op = CreateOpAndInfer( + rewriter, op->getLoc(), output_bool_type, zero, + a2_lhs_mul_rhs_op.getResult()); + + auto a7_a1_sub_one_op = + CreateOpAndInfer(rewriter, op->getLoc(), output_type, + a1_int_div_op_casted.getResult(), one); + + // (trunc_value * rhs_value != lhs_value) + // && (sign(lhs_value) != sign(rhs_value)) + auto a8_a5_and_a6_op = CreateOpAndInfer( + rewriter, op->getLoc(), output_bool_type, a5_not_a4_op.getResult(), + a6_zero_gt_a2_op.getResult()); + + auto a9_select_op = CreateOpAndInfer( + rewriter, op->getLoc(), output_type, a8_a5_and_a6_op.getResult(), + a7_a1_sub_one_op.getResult(), a1_int_div_op_casted.getResult()); + + return a9_select_op.getResult(); +} + // Lowers FloorDiv to a 
sequence of TOSA operators. std::optional convertFloorDivOp(PatternRewriter& rewriter, Operation* op, Value result_value, Value lhs_value, Value rhs_value) { - // FloorDiv lowering: + // FloorDiv lowering for float type: // floor(1/rhs * lhs) // // a1 = reciprocal(rhs); // a2 = mul(lhs, a1); // a3 = floor(a2); // return a3; + // + // FloorDiv lowering for integer type: + // See floorIntDiv() function for details ShapedType output_type = dyn_cast(result_value.getType()); // Not a shaped tensor output if (!output_type) return std::nullopt; @@ -2753,9 +2844,7 @@ std::optional convertFloorDivOp(PatternRewriter& rewriter, Operation* op, Type element_type = output_type.getElementType(); if (mlir::isa(element_type)) { - return CreateOpAndInfer(rewriter, op->getLoc(), output_type, - lhs_value, rhs_value) - .getResult(); + return floorIntDiv(rewriter, op, output_type, lhs_value, rhs_value); } auto a1_reciprocal_rhs_op = CreateOpAndInfer( @@ -2921,13 +3010,12 @@ std::optional convertReduceOpCommon( bool is_quantized, int32_t input_scale_multiplier, int32_t input_scale_shift, int64_t input_zp, int32_t output_scale_multiplier, int32_t output_scale_shift, - int64_t output_zp, StringRef nan_mode = "") { + int64_t output_zp, bool keep_dims, StringRef nan_mode = "") { RankedTensorType input_type = dyn_cast(input_value.getType()); if (!input_type) return std::nullopt; ArrayRef input_shape = input_type.getShape(); - ArrayRef output_shape = output_type.getShape(); auto input_rank = input_shape.size(); Location loc = op->getLoc(); @@ -2985,15 +3073,38 @@ std::optional convertReduceOpCommon( } if (is_quantized) { + std::string rounding_mode = IsTFLDoubleRoundingMode() ? "DOUBLE_ROUND" : "SINGLE_ROUND"; UnrankedTensorType output_rescale_type = UnrankedTensorType::get(output_type.getElementType()); val = buildRescale(rewriter, op, output_rescale_type, val, output_scale_multiplier, output_scale_shift, - /*input_zp=*/0, output_zp, IsTFLDoubleRoundingMode(), + /*input_zp=*/0, output_zp, rounding_mode, /*scale32=*/true); } + // If keep dims, no reshaping of the output is required + if (keep_dims) { + return val; + } + // Squeeze out the reduced axes. 
+ const auto squeeze_axes = [](llvm::ArrayRef in, llvm::ArrayRef axes) { + llvm::SmallVector sorted_axes{axes}; + std::sort(sorted_axes.begin(), sorted_axes.end()); + auto current_axis = sorted_axes.begin(); + + llvm::SmallVector out; + out.reserve(in.size() - axes.size()); + for (const auto& [i, dim] : llvm::enumerate(in)) { + if (current_axis == sorted_axes.end() || i != *current_axis) + out.push_back(dim); + else + current_axis++; + } + return out; + }; + + const auto output_shape = squeeze_axes(input_shape, axes); auto output_shape_value = getTosaConstShape(rewriter, op->getLoc(), tensorflow::ConvertMlirShapeToTF(output_shape)); @@ -3009,7 +3120,7 @@ std::optional convertReduceOpCommon( PatternRewriter& rewriter, Operation* op, RankedTensorType output_type, Value input_value, ElementsAttr axes_elems, Type reduce_element_type, bool is_quantized, double input_scale, int64_t input_zp, - double output_scale, int64_t output_zp, StringRef nan_mode = "") { + double output_scale, int64_t output_zp, bool keep_dims, StringRef nan_mode = "") { const int32_t scale_width = 32; int32_t input_scale_multiplier; @@ -3025,7 +3136,7 @@ std::optional convertReduceOpCommon( return convertReduceOpCommon( rewriter, op, output_type, input_value, axes_elems, reduce_element_type, is_quantized, input_scale_multiplier, input_scale_shift, input_zp, - output_scale_multiplier, output_scale_shift, output_zp, nan_mode); + output_scale_multiplier, output_scale_shift, output_zp, keep_dims, nan_mode); } // Lowers ReduceAll to a sequence of TOSA ops. @@ -3033,14 +3144,15 @@ std::optional convertReduceAllOp(PatternRewriter& rewriter, Operation* op, RankedTensorType output_type, Value input_value, - ElementsAttr axes_elems) { + ElementsAttr axes_elems, + bool keep_dims) { RankedTensorType input_type = dyn_cast(input_value.getType()); if (!input_type) return std::nullopt; return convertReduceOpCommon( rewriter, op, output_type, input_value, axes_elems, - output_type.getElementType(), false, 1.0f, 0, 1.0f, 0); + output_type.getElementType(), false, 1.0f, 0, 1.0f, 0, keep_dims); } // Lowers ReduceAny to a sequence of TOSA ops. @@ -3048,14 +3160,15 @@ std::optional convertReduceAnyOp(PatternRewriter& rewriter, Operation* op, RankedTensorType output_type, Value input_value, - ElementsAttr axes_elems) { + ElementsAttr axes_elems, + bool keep_dims) { RankedTensorType input_type = dyn_cast(input_value.getType()); if (!input_type) return std::nullopt; return convertReduceOpCommon( rewriter, op, output_type, input_value, axes_elems, - output_type.getElementType(), false, 1.0f, 0, 1.0f, 0); + output_type.getElementType(), false, 1.0f, 0, 1.0f, 0, keep_dims); } // Lowers ReduceMin to a sequence of TOSA ops. @@ -3064,6 +3177,7 @@ std::optional convertReduceMinOp(PatternRewriter& rewriter, RankedTensorType output_type, Value input_value, ElementsAttr axes_elems, + bool keep_dims, StringRef nan_mode) { RankedTensorType input_type = dyn_cast(input_value.getType()); @@ -3071,7 +3185,7 @@ std::optional convertReduceMinOp(PatternRewriter& rewriter, return convertReduceOpCommon( rewriter, op, output_type, input_value, axes_elems, - output_type.getElementType(), false, 1.0f, 0, 1.0f, 0, nan_mode); + output_type.getElementType(), false, 1.0f, 0, 1.0f, 0, keep_dims, nan_mode); } // Lowers ReduceMax to a sequence of TOSA ops. 
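As a concrete check on the squeeze_axes helper added above: when keep_dims is false, the reduced axes are simply dropped from the input shape to form the output shape. A self-contained sketch using std::vector in place of llvm::SmallVector (SqueezeAxes is our name for the illustration):

```c++
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Drop the reduced axes from the input shape, mirroring the lambda above.
std::vector<int64_t> SqueezeAxes(std::vector<int64_t> shape,
                                 std::vector<int64_t> axes) {
  std::sort(axes.begin(), axes.end());
  std::vector<int64_t> out;
  auto axis_it = axes.begin();
  for (size_t i = 0; i < shape.size(); ++i) {
    if (axis_it != axes.end() && static_cast<int64_t>(i) == *axis_it) {
      ++axis_it;  // this dimension is reduced away
    } else {
      out.push_back(shape[i]);
    }
  }
  return out;
}

int main() {
  // Reducing a [2, 3, 4, 5] tensor over axes {1, 3} yields shape [2, 4].
  assert((SqueezeAxes({2, 3, 4, 5}, {1, 3}) == std::vector<int64_t>{2, 4}));
  return 0;
}
```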
@@ -3080,6 +3194,7 @@ std::optional convertReduceMaxOp(PatternRewriter& rewriter, RankedTensorType output_type, Value input_value, ElementsAttr axes_elems, + bool keep_dims, StringRef nan_mode) { RankedTensorType input_type = dyn_cast(input_value.getType()); @@ -3087,7 +3202,7 @@ std::optional convertReduceMaxOp(PatternRewriter& rewriter, return convertReduceOpCommon( rewriter, op, output_type, input_value, axes_elems, - output_type.getElementType(), false, 1.0f, 0, 1.0f, 0, nan_mode); + output_type.getElementType(), false, 1.0f, 0, 1.0f, 0, keep_dims, nan_mode); } // Lowers ReduceProd to a sequence of TOSA ops. @@ -3095,7 +3210,8 @@ std::optional convertReduceProdOp(PatternRewriter& rewriter, Operation* op, RankedTensorType output_type, Value input_value, - ElementsAttr axes_elems) { + ElementsAttr axes_elems, + bool keep_dims) { RankedTensorType input_type = dyn_cast(input_value.getType()); if (!input_type) return std::nullopt; @@ -3113,7 +3229,7 @@ std::optional convertReduceProdOp(PatternRewriter& rewriter, return convertReduceOpCommon( rewriter, op, output_type, input_value, axes_elems, - output_type.getElementType(), false, 1.0f, 0, 1.0f, 0); + output_type.getElementType(), false, 1.0f, 0, 1.0f, 0, keep_dims); } // Lowers ReduceSum to a sequence of TOSA ops. @@ -3121,7 +3237,8 @@ std::optional convertReduceSumOp(PatternRewriter& rewriter, Operation* op, RankedTensorType output_type, Value input_value, - ElementsAttr axes_elems) { + ElementsAttr axes_elems, + bool keep_dims) { RankedTensorType input_type = dyn_cast(input_value.getType()); if (!input_type) return std::nullopt; @@ -3164,7 +3281,7 @@ std::optional convertReduceSumOp(PatternRewriter& rewriter, return convertReduceOpCommon( rewriter, op, output_type, input_value, axes_elems, reduce_element_type, - input_is_qtype, input_scale, input_zp, output_scale, output_zp); + input_is_qtype, input_scale, input_zp, output_scale, output_zp, keep_dims); } // Lowers ReduceMean to a sequence of TOSA ops. @@ -3172,7 +3289,8 @@ std::optional convertReduceMeanOp(PatternRewriter& rewriter, Operation* op, RankedTensorType output_type, Value input_value, - ElementsAttr axes_elems) { + ElementsAttr axes_elems, + bool keep_dims) { // reduce_mean is lowered as followed for quantized types: // op1 = reduce_sum(input) with the 1.0/num_elements_on_reduced_axis // integrated to the rescale layer, @@ -3265,7 +3383,7 @@ std::optional convertReduceMeanOp(PatternRewriter& rewriter, auto val = convertReduceOpCommon( rewriter, op, output_type, input_value, axes_elems, reduce_element_type, input_is_qtype, input_scale_multiplier, input_scale_shift, input_zp, - output_scale_multiplier, output_scale_shift, output_zp); + output_scale_multiplier, output_scale_shift, output_zp, keep_dims); if (!val.has_value()) return std::nullopt; @@ -3493,7 +3611,7 @@ std::optional convertResizeOp(PatternRewriter& rewriter, Operation* op, // This should be the expected lowering, but is +-1 within compared to // TFLite reference. 
return buildRescale(rewriter, op, output_type, resize_op.getResult(), - 1.0 / (scale_y_n * scale_x_n), 0, 0, false, + 1.0 / (scale_y_n * scale_x_n), 0, 0, "SINGLE_ROUND", is_scale32); #endif @@ -3837,7 +3955,7 @@ std::optional convertConv3DCommon( (void)rewriter.notifyMatchFailure(op, "currently only supports NDHWC"); return std::nullopt; } - RankedTensorType filter_type = filter.getType().cast(); + RankedTensorType filter_type = mlir::cast(filter.getType()); // Note that the kernel shape of tfl.conv_3d isn't [O, D, H, W, I] but // [D, H, W, I, O] which is the same as in TF. // Transpose filter shape from [D, H, W, I, O] to [O, D, H, W, C] @@ -4303,6 +4421,229 @@ std::optional convertGatherNdOp(PatternRewriter& rewriter, Operation* op, .getResult(); } +std::optional convertScatterNdOp(PatternRewriter& rewriter, + Operation* op, Value result_value, + Value indices_value, + Value updates_value, + Value shape_value) { + auto const result_type = dyn_cast(result_value.getType()); + auto const indices_type = dyn_cast(indices_value.getType()); + auto const updates_type = dyn_cast(updates_value.getType()); + auto const shape_type = dyn_cast(shape_value.getType()); + + if (!result_type || !indices_type || !updates_type || !shape_type) { + (void)rewriter.notifyMatchFailure( + op, "input/output types must be ranked tensor type"); + return std::nullopt; + } + + // Don't support variable indices yet since we cannot check uniqueness + // of indices in this case + Operation* indices_op = indices_value.getDefiningOp(); + if (!indices_op || !llvm::isa(indices_op)) { + (void)rewriter.notifyMatchFailure(op, "indices must be a constant tensor"); + return std::nullopt; + } + + Type indices_elmt_type = indices_type.getElementType(); + if (!indices_elmt_type.isInteger(32)) { + (void)rewriter.notifyMatchFailure(op, "indices expected to be int32"); + return std::nullopt; + } + + // The tosa scatter operation only supports unique indices, so if there + // are duplicates, we cannot legalize + tosa::ConstOp const_indices = cast(indices_op); + ElementsAttr const_data = const_indices.getValues(); + if (!checkUniqueConstantScatterIndices(indices_type, result_type, + const_data)) { + (void)rewriter.notifyMatchFailure(op, "index values must be unique"); + return std::nullopt; + } + + // N: number of batches + // Always 1 for ScatterND + // + // Because TOSA's SCATTER operator already uses the symbol 'N' for + // the number of batches, we will use the symbol 'ND' to specify the + // number of dimensions that are sliced from input instead of'N' in + // the TF MLIR documentation. + // + // ND: indices.shape[-1] + // + // W: number of indices in each batch + // Computed as: + // product(indices.shape[0:-1]) (all but the last dimension) + // + // K: range of each index + // Computed as: + // product(result.shape[0:ND-1]) + // + // C: number of channels for each index + // Computed as: + // product(result.shape[ND:]) + // + // The updates tensor needs to be reshaped, but not transposed, to move + // the dimensions into [N, W, C] order. + // + // Indices needs to be put in the form of [N, W], but a simple flattening + // will not suffice, because the indices need to index into the [W]-shape + // updates vector instead. + // + // To flatten the coordinates, first reshape indices to a [W, ND] matrix, + // where the matrix now represents W ND-dimensional coordinates into the + // updates tensor. 
+ // + // From here, we take each of the ND dimensions and multiply it with + // the size of the next updates dimension (or 1 for the last + // dimension), then sum all these together with a reduce_sum + // operator. This is exactly the same mathematics as one would use + // flatten the indices of an N-dimensional row-major array into a + // 1-D array in C. + // + // More precisely, do an element-wise multiply with [updates.shape[1 + // .. ND], 1] in axis 1, then reduce_sum in axis 1 to flatten to a + // [W]-shaped tensor, then trivially reshape to [N=1, W] to be + // compatible with the SCATTER operator's shape. + // + // Then perform the tosa.SCATTER() operation. + // + // Now we have result = [N, K, C]. + // + // Reshape with a single, simple reshape to the final output shape + // provided by shape_value. + + const unsigned int input_output_rank = result_type.getShape().size(); + const unsigned int indices_rank = indices_type.getShape().size(); + + const unsigned int ND = indices_type.getShape()[indices_rank - 1]; + + if (ND > input_output_rank) { + (void)rewriter.notifyMatchFailure( + op, "size of last dimension of indices must be <= input/output rank"); + return std::nullopt; + } + + // Calculate N, K, W, C. (N is always 1) + auto const indices_shape_begin{indices_type.getShape().begin()}; + auto const result_shape_begin{result_type.getShape().begin()}; + auto const accumulate_func = [](auto const& a_, auto const& b_) { + return a_ * b_; + }; + + const unsigned int N = 1; + const unsigned int W = std::accumulate(indices_shape_begin, + indices_shape_begin + indices_rank - 1, + 1, accumulate_func); + const unsigned int K = std::accumulate( + result_shape_begin, result_shape_begin + ND, 1, accumulate_func); + const unsigned int C = std::accumulate(result_shape_begin + ND, + result_shape_begin + input_output_rank, + 1, accumulate_func); + + SmallVector tosa_indices_shape({N, W}); + SmallVector indices_matrix_shape({W, ND}); + SmallVector tosa_input_shape({N, W, C}); + SmallVector tosa_values_in_out_shape({N, K, C}); + + // Flatten the updates tensor to an [N, W] matrix. + auto input_shape_value = + getTosaConstShape(rewriter, op->getLoc(), + tensorflow::ConvertMlirShapeToTF(tosa_input_shape)); + auto tosa_input_reshape_op = CreateOpAndInfer( + rewriter, op->getLoc(), + tensorflow::GetTypeFromTFTensorShape(tosa_input_shape, + result_type.getElementType()), + updates_value, input_shape_value); + + // Flatten the indices tensor to an [W, ND] matrix. 
+ auto indices_matrix_shape_value = + getTosaConstShape(rewriter, op->getLoc(), + tensorflow::ConvertMlirShapeToTF(indices_matrix_shape)); + auto indices_matrix_reshape_op = CreateOpAndInfer( + rewriter, op->getLoc(), + tensorflow::GetTypeFromTFTensorShape(indices_matrix_shape, + indices_elmt_type), + indices_value, indices_matrix_shape_value); + + SmallVector flattened_coeff_vec; + for (int i = 1; i < ND; i++) { + flattened_coeff_vec.push_back(result_type.getShape()[i]); + } + flattened_coeff_vec.push_back(1); + for (int i = ND - 1; i > 0; i--) { + flattened_coeff_vec[i - 1] *= flattened_coeff_vec[i]; + } + std::optional flattened_coeff_value = getConstTensor( + rewriter, op, flattened_coeff_vec, + {static_cast(flattened_coeff_vec.size())}); + + if (!flattened_coeff_value) { + (void)rewriter.notifyMatchFailure( + op, "failed to calculate flattened coeff value"); + return std::nullopt; + } + + // Multiply the coefficients by the coordinates + Value mul_x = indices_matrix_reshape_op.getResult(); + Value mul_y = flattened_coeff_value.value(); + RankedTensorType mul_type = tensorflow::GetTypeFromTFTensorShape( + indices_matrix_shape, indices_type.getElementType()); + if (EqualizeRanks(rewriter, op->getLoc(), mul_x, mul_y).failed()) { + (void)rewriter.notifyMatchFailure( + op, "failed to broadcast coefficients over the coordinates"); + return std::nullopt; + } + auto flattened_indices_mul_op = CreateMulOpAndInfer( + rewriter, op, mul_type, mul_x, mul_y); + + // Sum up the products of the coefficients and coordinates + auto flattened_indices_reduce_op = CreateOpAndInfer( + rewriter, op->getLoc(), + tensorflow::GetTypeFromTFTensorShape(tosa_indices_shape, + indices_type.getElementType()), + flattened_indices_mul_op.getResult(), rewriter.getI32IntegerAttr(1)); + + // And reshape to [N, W] + auto tosa_indices_shape_value = + getTosaConstShape(rewriter, op->getLoc(), + tensorflow::ConvertMlirShapeToTF(tosa_indices_shape)); + auto tosa_indices_reshape_op = CreateOpAndInfer( + rewriter, op->getLoc(), + tensorflow::GetTypeFromTFTensorShape(tosa_indices_shape, + indices_type.getElementType()), + flattened_indices_reduce_op.getResult(), tosa_indices_shape_value); + + // Scatter_nd has no input tensor, use a zero tensor + Type const_element_type = updates_type.getElementType(); + auto const_type = + RankedTensorType::get(tosa_values_in_out_shape, const_element_type); + if (mlir::isa(const_element_type)) { + auto quant_type = dyn_cast(const_element_type); + const_element_type = quant_type.getStorageType(); + } + auto const_storage_type = + RankedTensorType::get(tosa_values_in_out_shape, const_element_type); + auto const_attr = DenseElementsAttr::get( + const_storage_type, rewriter.getZeroAttr(const_element_type)); + Value tosa_values_in = + rewriter.create(op->getLoc(), const_type, const_attr); + + // Now the scatter op itself + auto tosa_scatter_op = CreateOpAndInfer( + rewriter, op->getLoc(), result_type, tosa_values_in, + tosa_indices_reshape_op.getResult(), tosa_input_reshape_op.getResult()); + + // Finally, reshape back to the expected output shape. + auto reshape_shape_value = + getTosaConstShape(rewriter, op->getLoc(), + tensorflow::ConvertMlirShapeToTF(result_type.getShape())); + return CreateOpAndInfer(rewriter, op->getLoc(), result_type, + tosa_scatter_op.getResult(), + reshape_shape_value) + .getResult(); +} + // Lowers OneHot operator to a sequence of TOSA ops. 
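Before the OneHot lowering, a worked example of the index flattening performed by convertScatterNdOp above: the coefficient vector is result.shape[1..ND-1] followed by 1, suffix-multiplied, and each ND-dimensional index dots with it to produce a flat offset in [0, K). A self-contained sketch with made-up shapes (FlattenIndex is our name):

```c++
#include <cassert>
#include <cstdint>
#include <vector>

// Flatten one ND-dimensional index into result's leading dimensions,
// using the same coefficient construction as the lowering above.
int64_t FlattenIndex(const std::vector<int64_t>& result_shape, int64_t nd,
                     const std::vector<int64_t>& index) {
  std::vector<int64_t> coeff;
  for (int64_t i = 1; i < nd; ++i) coeff.push_back(result_shape[i]);
  coeff.push_back(1);
  for (int64_t i = nd - 1; i > 0; --i) coeff[i - 1] *= coeff[i];
  int64_t flat = 0;
  for (int64_t i = 0; i < nd; ++i) flat += coeff[i] * index[i];
  return flat;
}

int main() {
  // result shape [3, 4, 5] with ND = 2: K = 3 * 4 = 12 and C = 5.
  // The coefficients are {4, 1}, so index (2, 3) flattens to 2*4 + 3 = 11.
  assert(FlattenIndex({3, 4, 5}, 2, {2, 3}) == 11);
  return 0;
}
```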
std::optional convertOneHotOp(PatternRewriter& rewriter, Operation* op, Value result_value, Value indices_value, @@ -4661,7 +5002,7 @@ std::optional convertBroadcastToOp(PatternRewriter& rewriter, // Lowers cast operator to a sequence of TOSA ops. std::optional convertCastOp(PatternRewriter& rewriter, Operation* op, Value input, RankedTensorType output_type) { - auto input_type = input.getType().cast(); + auto input_type = mlir::cast(input.getType()); auto input_element_type = input_type.getElementType(); Value cast_input = input; diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.h b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.h index ff42f56ba34f..8cc74ee9bd51 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.h +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.h @@ -147,6 +147,11 @@ std::optional convertStridedSliceOp( int32_t begin_mask, int32_t end_mask, int32_t ellipsis_mask, int32_t new_axis_mask, int32_t shrink_axis_mask); +// Helper function to perform division with floor rounding mode (rounding result +// down) for integer type inputs. +Value floorIntDiv(PatternRewriter& rewriter, Operation* op, ShapedType outType, + Value lhs, Value rhs); + // Lowers FloorDiv to a sequence of TOSA operators. std::optional convertFloorDivOp(PatternRewriter& rewriter, Operation* op, Value result_value, Value lhs_value, @@ -174,14 +179,16 @@ std::optional convertReduceAllOp(PatternRewriter& rewriter, Operation* op, RankedTensorType output_type, Value input_value, - ElementsAttr axes_elems); + ElementsAttr axes_elems, + bool keep_dims); // Lowers ReduceAny to a sequence of TOSA ops. std::optional convertReduceAnyOp(PatternRewriter& rewriter, Operation* op, RankedTensorType output_type, Value input_value, - ElementsAttr axes_elems); + ElementsAttr axes_elems, + bool keep_dims); // Lowers ReduceMin to a sequence of TOSA ops. std::optional convertReduceMinOp(PatternRewriter& rewriter, @@ -189,6 +196,7 @@ std::optional convertReduceMinOp(PatternRewriter& rewriter, RankedTensorType output_type, Value input_value, ElementsAttr axes_elems, + bool keep_dims, StringRef nan_mode = "PROPAGATE"); // Lowers ReduceMax to a sequence of TOSA ops. @@ -197,6 +205,7 @@ std::optional convertReduceMaxOp(PatternRewriter& rewriter, RankedTensorType output_type, Value input_value, ElementsAttr axes_elems, + bool keep_dims, StringRef nan_mode = "PROPAGATE"); // Lowers ReduceProd to a sequence of TOSA ops. @@ -204,21 +213,24 @@ std::optional convertReduceProdOp(PatternRewriter& rewriter, Operation* op, RankedTensorType output_type, Value input_value, - ElementsAttr axes_elems); + ElementsAttr axes_elems, + bool keep_dims); // Lowers ReduceSum to a sequence of TOSA ops. std::optional convertReduceSumOp(PatternRewriter& rewriter, Operation* op, RankedTensorType output_type, Value input_value, - ElementsAttr axes_elems); + ElementsAttr axes_elems, + bool keep_dims); // Lowers ReduceMean to a sequence of TOSA ops. std::optional convertReduceMeanOp(PatternRewriter& rewriter, Operation* op, RankedTensorType output_type, Value input_value, - ElementsAttr axes_elem); + ElementsAttr axes_elem, + bool keep_dims); // Lowers ResizeBilinear and ResizeNearestNeighbor to TOSA resize. std::optional convertResizeOp(PatternRewriter& rewriter, Operation* op, @@ -293,6 +305,12 @@ std::optional convertGatherNdOp(PatternRewriter& rewriter, Operation* op, Value result_value, Value params_value, Value indices_value); +// Lowers ScatterNd operator to a sequence of TOSA ops. 
+std::optional convertScatterNdOp(PatternRewriter& rewriter, + Operation* op, Value result_value, + Value indices_value, + Value updates_value, Value shape_value); + // Lowers OneHot operator to a sequence of TOSA ops. std::optional convertOneHotOp(PatternRewriter& rewriter, Operation* op, Value result_value, Value indices_value, diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc index b355829547f0..5f2f04ad4051 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc @@ -131,6 +131,7 @@ DECL_CONVERT_OP(ResizeNearestNeighbor); DECL_CONVERT_OP(Gather); DECL_CONVERT_OP(GatherV2); DECL_CONVERT_OP(GatherNd); +DECL_CONVERT_OP(ScatterNd); DECL_CONVERT_OP(SelectV2); DECL_CONVERT_OP(SpaceToDepth); DECL_CONVERT_OP(DepthToSpace); @@ -176,7 +177,7 @@ LogicalResult ConvertTFReluOp::matchAndRewrite( } mlir::Attribute min_val, max_val; - if (element_type.isa()) { + if (mlir::isa(element_type)) { min_val = rewriter.getFloatAttr(element_type, 0.0f); max_val = rewriter.getFloatAttr(element_type, std::numeric_limits::max()); @@ -207,7 +208,7 @@ LogicalResult ConvertTFRelu6Op::matchAndRewrite( } mlir::Attribute min_val, max_val; - if (element_type.isa()) { + if (mlir::isa(element_type)) { min_val = rewriter.getFloatAttr(element_type, 0.0f); max_val = rewriter.getFloatAttr(element_type, 6.0f); } else { @@ -1122,7 +1123,7 @@ LogicalResult ConvertTFAllOp::matchAndRewrite(Operation* op, return failure(); std::optional result = convertReduceAllOp( - rewriter, op, output_type, tf_all_op.getInput(), axes_elems); + rewriter, op, output_type, tf_all_op.getInput(), axes_elems, tf_all_op.getKeepDims()); if (!result) return failure(); @@ -1144,7 +1145,7 @@ LogicalResult ConvertTFAnyOp::matchAndRewrite(Operation* op, return failure(); std::optional result = convertReduceAnyOp( - rewriter, op, output_type, tf_any_op.getInput(), axes_elems); + rewriter, op, output_type, tf_any_op.getInput(), axes_elems, tf_any_op.getKeepDims()); if (!result) return failure(); @@ -1166,7 +1167,7 @@ LogicalResult ConvertTFMaxOp::matchAndRewrite(Operation* op, return failure(); std::optional result = convertReduceMaxOp( - rewriter, op, output_type, tf_max_op.getInput(), axes_elems); + rewriter, op, output_type, tf_max_op.getInput(), axes_elems, tf_max_op.getKeepDims()); if (!result) return failure(); @@ -1188,7 +1189,7 @@ LogicalResult ConvertTFMinOp::matchAndRewrite(Operation* op, return failure(); std::optional result = convertReduceMinOp( - rewriter, op, output_type, tf_min_op.getInput(), axes_elems); + rewriter, op, output_type, tf_min_op.getInput(), axes_elems, tf_min_op.getKeepDims()); if (!result) return failure(); @@ -1210,7 +1211,7 @@ LogicalResult ConvertTFMeanOp::matchAndRewrite( return failure(); std::optional result = convertReduceMeanOp( - rewriter, op, output_type, tf_mean_op.getInput(), axes_elems); + rewriter, op, output_type, tf_mean_op.getInput(), axes_elems, tf_mean_op.getKeepDims()); if (!result) return failure(); @@ -1232,7 +1233,7 @@ LogicalResult ConvertTFProdOp::matchAndRewrite( return failure(); std::optional result = convertReduceProdOp( - rewriter, op, output_type, tf_prod_op.getInput(), axes_elems); + rewriter, op, output_type, tf_prod_op.getInput(), axes_elems, tf_prod_op.getKeepDims()); if (!result) return failure(); @@ -1254,7 +1255,7 @@ LogicalResult ConvertTFSumOp::matchAndRewrite(Operation* op, return failure(); std::optional result = convertReduceSumOp( - 
rewriter, op, output_type, tf_sum_op.getInput(), axes_elems); + rewriter, op, output_type, tf_sum_op.getInput(), axes_elems, tf_sum_op.getKeepDims()); if (!result) return failure(); @@ -1446,7 +1447,7 @@ LogicalResult ConvertTFFusedBatchNormV3Op::matchAndRewrite( auto epsilon_const = CreateOpAndInfer( rewriter, op->getLoc(), epsilon_type, epsilon_attr); - variance_type = variance.getType().cast(); + variance_type = mlir::cast(variance.getType()); Value op2_add_var_epsilon = CreateOpAndInfer( rewriter, op->getLoc(), variance_type, variance, epsilon_const); @@ -1777,7 +1778,7 @@ LogicalResult ConvertTFPadV2Op::matchAndRewrite( auto tf_pad_op = cast(op); RankedTensorType output_type = - tf_pad_op.getResult().getType().dyn_cast(); + mlir::dyn_cast(tf_pad_op.getResult().getType()); if (!output_type) { return rewriter.notifyMatchFailure(op, "output type not a ranked tensor"); } @@ -2001,6 +2002,22 @@ LogicalResult ConvertTFGatherNdOp::matchAndRewrite( return success(); } +LogicalResult ConvertTFScatterNdOp::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + auto tfl_scatternd_op = cast(op); + + const std::optional result = convertScatterNdOp( + rewriter, op, tfl_scatternd_op.getResult(), tfl_scatternd_op.getIndices(), + tfl_scatternd_op.getUpdates(), tfl_scatternd_op.getShape()); + + if (!result) { + return failure(); + } + rewriter.replaceOp(op, {result.value()}); + + return success(); +} + LogicalResult ConvertTFSelectV2Op::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { auto tf_sel_op = cast(op); @@ -2620,6 +2637,7 @@ void populateLegalizeTFPatterns(MLIRContext* ctx, RewritePatternSet& patterns) { patterns.add(ctx); patterns.add(ctx); patterns.add(ctx); + patterns.add(ctx); patterns.add(ctx); patterns.add(ctx); patterns.add(ctx); diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc index 889acbdb9b42..b37319b07d6e 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc @@ -196,6 +196,7 @@ DECL_CONVERT_OP(Const); DECL_CONVERT_OP(QConst); DECL_CONVERT_OP(Gather); DECL_CONVERT_OP(GatherNd); +DECL_CONVERT_OP(ScatterNd); DECL_CONVERT_OP(SparseToDense); DECL_CONVERT_OP(OneHot); DECL_CONVERT_OP(ArgMax); @@ -207,8 +208,11 @@ DECL_CONVERT_OP(Imag); DECL_CONVERT_OP(RFFT2d); DECL_CONVERT_OP(LogicalAnd); DECL_CONVERT_OP(LogicalOr); +DECL_CONVERT_OP(BitwiseXor); DECL_CONVERT_OP(Pow); DECL_CONVERT_OP(BroadcastTo); +DECL_CONVERT_OP(Exp); +DECL_CONVERT_OP(Log); #undef DECL_CONVERT_OP @@ -349,7 +353,7 @@ LogicalResult ConvertTFLReluOp::matchAndRewrite( buildRescale(rewriter, op, output_type, tfl_relu_op.getX(), input_qtype.getScale() / output_qtype.getScale(), input_qtype.getZeroPoint(), output_qtype.getZeroPoint(), - /*double_round=*/false, /*scale32=*/true); + /*double_round=*/"SINGLE_ROUND", /*scale32=*/true); } auto element_type = input_type.getElementType(); @@ -359,7 +363,7 @@ LogicalResult ConvertTFLReluOp::matchAndRewrite( } mlir::Attribute min_val, max_val; - if (element_type.isa()) { + if (mlir::isa(element_type)) { min_val = rewriter.getFloatAttr(element_type, 0.0f); max_val = rewriter.getFloatAttr(element_type, std::numeric_limits::max()); @@ -419,7 +423,7 @@ LogicalResult ConvertTFLRelu1Op::matchAndRewrite( buildRescale(rewriter, op, output_type, tfl_relu1_op.getX(), input_qtype.getScale() / output_qtype.getScale(), input_qtype.getZeroPoint(), output_qtype.getZeroPoint(), - /*double_round=*/false, 
/*scale32=*/true); + /*double_round=*/"SINGLE_ROUND", /*scale32=*/true); } auto element_type = input_type.getElementType(); @@ -429,7 +433,7 @@ LogicalResult ConvertTFLRelu1Op::matchAndRewrite( } mlir::Attribute min_val, max_val; - if (element_type.isa()) { + if (mlir::isa(element_type)) { min_val = rewriter.getFloatAttr(element_type, -1.0f); max_val = rewriter.getFloatAttr(element_type, 1.0f); } else { @@ -486,7 +490,7 @@ LogicalResult ConvertTFLRelu0To1Op::matchAndRewrite( buildRescale(rewriter, op, output_type, tfl_relu0to1_op.getX(), input_qtype.getScale() / output_qtype.getScale(), input_qtype.getZeroPoint(), output_qtype.getZeroPoint(), - /*double_round=*/false, /*scale32=*/true); + /*double_round=*/"SINGLE_ROUND", /*scale32=*/true); } auto element_type = input_type.getElementType(); @@ -496,7 +500,7 @@ LogicalResult ConvertTFLRelu0To1Op::matchAndRewrite( } mlir::Attribute min_val, max_val; - if (element_type.isa()) { + if (mlir::isa(element_type)) { min_val = rewriter.getFloatAttr(element_type, 0.0f); max_val = rewriter.getFloatAttr(element_type, 1.0f); } else { @@ -553,7 +557,7 @@ LogicalResult ConvertTFLRelu6Op::matchAndRewrite( buildRescale(rewriter, op, output_type, tfl_relu6_op.getX(), input_qtype.getScale() / output_qtype.getScale(), input_qtype.getZeroPoint(), output_qtype.getZeroPoint(), - /*double_round=*/false, /*scale32=*/true); + /*double_round=*/"SINGLE_ROUND", /*scale32=*/true); } auto element_type = input_type.getElementType(); @@ -563,7 +567,7 @@ LogicalResult ConvertTFLRelu6Op::matchAndRewrite( } mlir::Attribute min_val, max_val; - if (element_type.isa()) { + if (mlir::isa(element_type)) { min_val = rewriter.getFloatAttr(element_type, 0.0f); max_val = rewriter.getFloatAttr(element_type, 6.0f); } else { @@ -1296,17 +1300,28 @@ LogicalResult ConvertTFLAveragePool2DOp::matchAndRewrite( // TensorFlow Lite doesn't use the zero point when calculating // quantized average pool, while TOSA does. 
Force the TOSA // zero_points to zero to ensure that the calculations match + Location loc = op->getLoc(); + const std::optional input_zp = + tosa::createZeroPointTensor(rewriter, loc, avg_pool_input.getType(), 0); + if (!input_zp.has_value()) + return op->emitError("Failed to create input zero-point tensor for AvgPool2D op."); + + const Value empty_output_val = rewriter.create(loc, + average_type.getShape(), average_type.getElementType()); + const std::optional output_zp = + tosa::createZeroPointTensor(rewriter, loc, empty_output_val.getType(), 0); + if (!output_zp.has_value()) + return op->emitError("Failed to create output zero-point tensor for AvgPool2D op."); - auto input_zp_attr = rewriter.getI32IntegerAttr(0); - auto output_zp_attr = rewriter.getI32IntegerAttr(0); result = CreateOpAndInfer( - rewriter, op->getLoc(), average_type, avg_pool_input, kernel_size, - stride, pad, acc_attr, input_zp_attr, output_zp_attr); + rewriter, op->getLoc(), average_type, avg_pool_input, input_zp.value(), + output_zp.value(), kernel_size, stride, pad, acc_attr); } else { result = CreateOpAndInfer( - rewriter, op->getLoc(), average_type, tfl_avgpool_op.getInput(), - kernel_size, stride, pad, acc_attr); + rewriter, op->getLoc(), average_type, avg_pool_input, kernel_size, + stride, pad, acc_attr); } + if (average_type != output_type) { result = CreateOpAndInfer(rewriter, op->getLoc(), output_type, result); @@ -1332,6 +1347,8 @@ LogicalResult ConvertTFLMaxPool2DOp::matchAndRewrite( DenseI64ArrayAttr kernel_size; DenseI64ArrayAttr stride; DenseI64ArrayAttr pad; + // Pooling has no non-unit dilation + DenseI64ArrayAttr dilation = rewriter.getDenseI64ArrayAttr({1, 1}); { int64_t kernel_h = tfl_maxpool_op.getFilterHeight(); int64_t kernel_w = tfl_maxpool_op.getFilterWidth(); @@ -1350,9 +1367,6 @@ LogicalResult ConvertTFLMaxPool2DOp::matchAndRewrite( if (!GetPaddingFromString(tfl_maxpool_op.getPadding().str(), &tf_pad).ok()) return failure(); - // Pooling has no non-unit dilation - DenseI64ArrayAttr dilation = rewriter.getDenseI64ArrayAttr({1, 1}); - RankedTensorType filter_type = RankedTensorType::get(i64array, rewriter.getIntegerType(64)); @@ -1365,8 +1379,13 @@ LogicalResult ConvertTFLMaxPool2DOp::matchAndRewrite( return failure(); } + // TFLite only supports NHWC format + const Value max_pool_input = getInputSlicedToItsUsedSize( + rewriter, op, tensorflow::FORMAT_NHWC, input_type, + tfl_maxpool_op.getInput(), kernel_size, pad, stride, dilation); + CreateReplaceOpAndInfer(rewriter, op, output_type, - tfl_maxpool_op.getInput(), + max_pool_input, kernel_size, stride, pad); return success(); } @@ -1500,6 +1519,102 @@ Value lowerGroupedConvolution(TFL::Conv2DOp op, PatternRewriter& rewriter) { convolutions, output_slice_dim); } +/* Ensure bias is of the correct type. +TOSA requires that bias must be of the same type as the output, and that +output type must be of a certain type depending on the input type. +*/ +static FailureOr> getTosaBias( + Operation* op, PatternRewriter& rewriter, ShapedType input_type, + ShapedType output_type, bool output_is_qtype, Value bias) { + Type bias_ety; + + int bias_bits; + if (output_is_qtype) { + auto input_qtype = + dyn_cast(input_type.getElementType()); + if (!input_qtype) { + return rewriter.notifyMatchFailure(op, + "output is qtype but input is not"); + } + int input_bits = input_qtype.getStorageTypeIntegralWidth(); + // For signed int8/int16 input tensor, int32/int48 bias and output + // tensor are generated. + bias_bits = input_bits == 16 ? 
48 : 32; + bias_ety = rewriter.getIntegerType(bias_bits); + } else { + bias_ety = output_type.getElementType(); + bias_bits = bias_ety.getIntOrFloatBitWidth(); + } + + if (!bias || !dyn_cast(bias.getType())) { + // The bias may actually be typed "None" which has no value. TOSA requires + // bias to be an array of output_channel_count values, so create a constant + // of the appropriate number and type of zeros. + RankedTensorType bias_type = RankedTensorType::get({1}, bias_ety); + auto bias_attr = rewriter.getZeroAttr(bias_type); + bias = CreateOpAndInfer(rewriter, op->getLoc(), bias_type, + mlir::cast(bias_attr)); + } + + auto prev_bias_type = dyn_cast(bias.getType()); + if (!prev_bias_type) { + return rewriter.notifyMatchFailure(op, "bias not a ranked tensor"); + } + + auto prev_bias_etype = prev_bias_type.getElementType(); + + int prev_bias_bits; + if (auto prev_bias_eqtype = + dyn_cast(prev_bias_etype)) { + prev_bias_bits = prev_bias_eqtype.getStorageTypeIntegralWidth(); + } else { + prev_bias_bits = prev_bias_etype.getIntOrFloatBitWidth(); + } + + if (prev_bias_bits == bias_bits) { + return std::pair(bias_ety, bias); + } + + auto const_op = bias.getDefiningOp(); + if (!const_op) { + return rewriter.notifyMatchFailure(op, "bias not a ConstOp"); + } + + DenseElementsAttr bias_attr; + { + auto prev_bias_attr = + dyn_cast(const_op.getValuesAttr()); + if (!prev_bias_attr) { + return rewriter.notifyMatchFailure( + op, "bias values not DenseIntElementsAttr"); + } + // Promote to int32/int48 if necessary. + bias_attr = prev_bias_attr.mapValues( + bias_ety, + [bias_bits = bias_ety.getIntOrFloatBitWidth()]( + const APInt& x) -> APInt { return x.sext(bias_bits); }); + } + + ShapedType bias_output_type; + if (auto bias_attr_type = dyn_cast(bias_attr.getType())) { + bias_output_type = bias_attr_type.clone(bias_ety); + } else { + bias_output_type = dyn_cast(const_op.getResult().getType()); + if (!bias_output_type) { + return rewriter.notifyMatchFailure( + op, "bias defining op result not ShapedType"); + } + bias_output_type = bias_output_type.clone(bias_ety); + } + + auto new_const_op = + rewriter.create(op->getLoc(), bias_output_type, bias_attr); + Value new_bias = new_const_op.getResult(); + rewriter.replaceOp(const_op, new_bias); + + return std::make_pair(bias_ety, new_bias); +} + LogicalResult ConvertTFLConv2DOp::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { auto tfl_conv2d_op = cast(op); @@ -1572,19 +1687,10 @@ LogicalResult ConvertTFLConv2DOp::matchAndRewrite( return failure(); } - Value unquantized_bias = tfl_conv2d_op.getBias(); - Type bias_ety = - output_is_qtype ? 
rewriter.getI32Type() : output_type.getElementType(); - if (unquantized_bias) { - Type new_bias_ety = getElementTypeOrSelf(unquantized_bias.getType()); - if (auto qtype = mlir::dyn_cast(new_bias_ety)) { - new_bias_ety = qtype.getStorageType(); - } - if (new_bias_ety.getIntOrFloatBitWidth() > - bias_ety.getIntOrFloatBitWidth()) { - bias_ety = new_bias_ety; - } - } + auto bias_result = getTosaBias(op, rewriter, input_type, output_type, + output_is_qtype, tfl_conv2d_op.getBias()); + if (failed(bias_result)) return failure(); + auto [bias_ety, bias_val] = bias_result.value(); // TFLite only supports NHWC format Value conv2d_input = getInputSlicedToItsUsedSize( @@ -1598,8 +1704,7 @@ LogicalResult ConvertTFLConv2DOp::matchAndRewrite( auto a1_conv2d_op = CreateOpAndInfer( rewriter, op->getLoc(), output_type.clone(bias_ety), conv2d_input, - tfl_conv2d_op.getFilter(), unquantized_bias, pad, stride, dilation, - acc_type); + tfl_conv2d_op.getFilter(), bias_val, pad, stride, dilation, acc_type); Value conv2d_output; if (input_is_qtype) { @@ -1643,11 +1748,11 @@ LogicalResult ConvertTFLConv3DOp::matchAndRewrite( } bool input_is_qtype = - input_type.getElementType().isa(); + mlir::isa(input_type.getElementType()); bool filter_is_qtype = - filter_type.getElementType().isa(); + mlir::isa(filter_type.getElementType()); bool output_is_qtype = - output_type.getElementType().isa(); + mlir::isa(output_type.getElementType()); if ((input_is_qtype != filter_is_qtype) || (input_is_qtype != output_is_qtype)) { @@ -1699,37 +1804,26 @@ LogicalResult ConvertTFLConv3DOp::matchAndRewrite( } } - Value unquantized_bias = tfl_conv3d_op.getBias(); - if (!dyn_cast(unquantized_bias.getType())) { - // The bias may actually be typed "None" which has no value. TOSA requires - // bias to be an array of output_channel_count values, so create a constant - // of the appropriate number and type of zeros. 
- auto bias_dim = filter_type.getShape().back(); - RankedTensorType bias_type = - RankedTensorType::get({bias_dim}, filter_type.getElementType()); - auto bias_attr = rewriter.getZeroAttr(bias_type); - unquantized_bias = CreateOpAndInfer( - rewriter, op->getLoc(), bias_type, bias_attr.cast()); - } - // TFLite only supports NDHWC format, tensorflow::FORMAT_NHWC is used for both // rank 4 and rank 5 tensors Value conv3d_input = getInputSlicedToItsUsedSize( rewriter, op, tensorflow::FORMAT_NHWC, input_type, tfl_conv3d_op.getInput(), kernel_size, pad, stride, dilation); - Type bias_ety = - unquantized_bias.getType().cast().getElementType(); + auto bias_result = getTosaBias(op, rewriter, input_type, output_type, + output_is_qtype, tfl_conv3d_op.getBias()); + if (failed(bias_result)) return failure(); + auto [bias_ety, bias_val] = bias_result.value(); auto acc_type = getConvAccTypeAttr(rewriter, /* input_etype = */ input_type.getElementType(), /* output_etype = */ bias_ety); - std::optional a1_conv3d_op = convertConv3DCommon( - rewriter, op, output_type.clone(bias_ety), conv3d_input, - tfl_conv3d_op.getFilter(), unquantized_bias, pad, stride, dilation, - acc_type, StringRef("NDHWC")); + std::optional a1_conv3d_op = + convertConv3DCommon(rewriter, op, output_type.clone(bias_ety), + conv3d_input, tfl_conv3d_op.getFilter(), bias_val, + pad, stride, dilation, acc_type, StringRef("NDHWC")); if (!a1_conv3d_op) return failure(); @@ -1778,23 +1872,6 @@ LogicalResult ConvertTFLTransposeConvOp::matchAndRewrite( bool output_is_qtype = mlir::isa(output_type.getElementType()); - const bool has_bias = - tfl_conv_op.getBias() && !isa(tfl_conv_op.getBias().getType()); - - if (has_bias) { - RankedTensorType bias_type = - dyn_cast(tfl_conv_op.getBias().getType()); - bool bias_is_qtype = - isa(bias_type.getElementType()); - - if (input_is_qtype != bias_is_qtype) { - return rewriter.notifyMatchFailure( - op, - "input/bias tensor should " - "be all quantized or all floating-point"); - } - } - if ((input_is_qtype != filter_is_qtype) || (input_is_qtype != output_is_qtype)) { return rewriter.notifyMatchFailure( @@ -1824,49 +1901,10 @@ LogicalResult ConvertTFLTransposeConvOp::matchAndRewrite( return failure(); } - int output_channel = 0; - // TODO(suderman): We need to figure out how to guarantee output channel - // propagation. 
- if (output_type.hasRank()) { - output_channel = output_type.getDimSize(3); - } else if (filter_type.hasRank()) { - output_channel = filter_type.getDimSize(0); - } else { - return failure(); - } - - Value bias_val; - if (has_bias) { - bias_val = tfl_conv_op.getBias(); - } else { - std::optional zero_bias; - if (input_is_qtype) { - uint32_t input_bits = - cast(input_type.getElementType()) - .getStorageTypeIntegralWidth(); - uint32_t weight_bits = - cast(filter_type.getElementType()) - .getStorageTypeIntegralWidth(); - - if (input_bits == 16 && weight_bits == 8) { - // For signed 16x8, the output is accumulated into int48 - SmallVector vec(output_channel, APInt(48, 0, true)); - zero_bias = getConstTensor(rewriter, op, vec, {output_channel}); - } else { - SmallVector vec(output_channel, 0); - zero_bias = - getConstTensor(rewriter, op, vec, {output_channel}); - } - } else { - SmallVector vec(output_channel, 0.0f); - zero_bias = getConstTensor(rewriter, op, vec, {output_channel}); - } - - if (!zero_bias) return failure(); - bias_val = zero_bias.value(); - } - - Type bias_ety = cast(bias_val.getType()).getElementType(); + auto bias_result = getTosaBias(op, rewriter, input_type, output_type, + output_is_qtype, tfl_conv_op.getBias()); + if (failed(bias_result)) return failure(); + auto [bias_ety, bias_val] = bias_result.value(); auto acc_type = getConvAccTypeAttr(rewriter, @@ -1875,8 +1913,8 @@ LogicalResult ConvertTFLTransposeConvOp::matchAndRewrite( auto a1_conv2d_op = CreateOpAndInfer( rewriter, op->getLoc(), output_type.clone(bias_ety), - tfl_conv_op.getInput(), tfl_conv_op.getWeights(), bias_val, - outpad, stride, acc_type); + tfl_conv_op.getInput(), tfl_conv_op.getWeights(), bias_val, outpad, + stride, acc_type); Value conv2d_output; if (input_is_qtype) { @@ -1920,11 +1958,11 @@ LogicalResult ConvertTFLDepthwiseConv2DOp::matchAndRewrite( if (!filter_type) return failure(); bool input_is_qtype = - input_type.getElementType().isa(); + mlir::isa(input_type.getElementType()); bool filter_is_qtype = - filter_type.getElementType().isa(); + mlir::isa(filter_type.getElementType()); bool output_is_qtype = - output_type.getElementType().isa(); + mlir::isa(output_type.getElementType()); if ((input_is_qtype != filter_is_qtype) || (input_is_qtype != output_is_qtype)) { @@ -2009,20 +2047,10 @@ LogicalResult ConvertTFLDepthwiseConv2DOp::matchAndRewrite( filter_type.getElementType()), a1_filter_transpose_op.getResult(), a2_reshape_dims_value); - Type bias_ety = - output_is_qtype ? 
rewriter.getI32Type() : output_type.getElementType(); - - Value unquantized_bias = tfl_conv2d_op.getBias(); - if (unquantized_bias) { - Type new_bias_ety = getElementTypeOrSelf(unquantized_bias.getType()); - if (auto qtype = new_bias_ety.dyn_cast()) { - new_bias_ety = qtype.getStorageType(); - } - if (new_bias_ety.getIntOrFloatBitWidth() > - bias_ety.getIntOrFloatBitWidth()) { - bias_ety = new_bias_ety; - } - } + auto bias_result = getTosaBias(op, rewriter, input_type, output_type, + output_is_qtype, tfl_conv2d_op.getBias()); + if (failed(bias_result)) return failure(); + auto [bias_ety, bias_val] = bias_result.value(); // TFLite only supports NHWC format Value conv2d_input = getInputSlicedToItsUsedSize( @@ -2036,7 +2064,7 @@ LogicalResult ConvertTFLDepthwiseConv2DOp::matchAndRewrite( auto a3_depthwise_conv2d_op = CreateOpAndInfer( rewriter, op->getLoc(), output_type.clone(bias_ety), conv2d_input, - a2_filter_reshape_op.getResult(), unquantized_bias, pad, stride, dilation, + a2_filter_reshape_op.getResult(), bias_val, pad, stride, dilation, acc_type); Value conv2d_output; @@ -2127,8 +2155,8 @@ LogicalResult ConvertTFLBatchMatMulOp::matchAndRewrite( rewriter, op->getLoc(), UnrankedTensorType::get(rhs_ty.getElementType()), rhs, new_rhs_shape_value); - lhs_ty = lhs.getType().cast(); - rhs_ty = rhs.getType().cast(); + lhs_ty = mlir::cast(lhs.getType()); + rhs_ty = mlir::cast(rhs.getType()); } if (transpose_lhs) { @@ -2220,8 +2248,6 @@ LogicalResult ConvertTFLFullyConnectedOp::matchAndRewrite( dyn_cast(tfl_fc_op.getInput().getType()); RankedTensorType filter_type = dyn_cast(tfl_fc_op.getFilter().getType()); - RankedTensorType bias_type = - dyn_cast(tfl_fc_op.getBias().getType()); if (!input_type || !filter_type) return failure(); bool input_is_qtype = @@ -2295,53 +2321,10 @@ LogicalResult ConvertTFLFullyConnectedOp::matchAndRewrite( filter_val, new_filter_shape_value); filter_type = cast(filter_val.getType()); - Value bias_val; - if (!bias_type) { - // For some matmuls, the bias may actually be a "UnitType" which has no - // value. TOSA requires bias to be an array of output_channel_count values, - // so create a constant of the appropriate number and type of zeros. - SmallVector bias_shape({filter_type.getShape()[0]}); - RankedTensorType new_bias_type; - - DenseElementsAttr bias_attr; - if (mlir::isa(input_type.getElementType())) { - SmallVector bias_arr(bias_shape[0]); - - for (int i = 0; i < bias_shape[0]; i++) { - bias_arr[i] = 0.0; - } - new_bias_type = - RankedTensorType::get(bias_shape, input_type.getElementType()); - bias_attr = - DenseElementsAttr::get(new_bias_type, llvm::ArrayRef(bias_arr)); - } else { - SmallVector bias_arr(bias_shape[0]); - - for (int i = 0; i < bias_shape[0]; i++) { - bias_arr[i] = 0; - } - if (!input_is_qtype) { - return rewriter.notifyMatchFailure( - op, "input must be quantized type if it's not float type"); - } - auto input_qtype = - mlir::cast(input_type.getElementType()); - Type new_bias_ety = input_qtype.getStorageTypeIntegralWidth() == 16 - ? 
rewriter.getIntegerType(48) - : rewriter.getI32Type(); - new_bias_type = RankedTensorType::get(bias_shape, new_bias_ety); - bias_attr = - DenseElementsAttr::get(new_bias_type, llvm::ArrayRef(bias_arr)); - } - auto bias_op = CreateOpAndInfer(rewriter, op->getLoc(), - new_bias_type, bias_attr); - bias_val = bias_op.getResult(); - bias_type = new_bias_type; - } else { - bias_val = tfl_fc_op.getBias(); - } - - Type bias_ety = mlir::cast(bias_val.getType()).getElementType(); + auto bias_result = getTosaBias(op, rewriter, input_type, output_type, + output_is_qtype, tfl_fc_op.getBias()); + if (failed(bias_result)) return failure(); + auto [bias_ety, bias_val] = bias_result.value(); auto acc_type = getConvAccTypeAttr(rewriter, @@ -2367,19 +2350,16 @@ LogicalResult ConvertTFLFullyConnectedOp::matchAndRewrite( // If we know the output rank, we need to ensure the output shape is correct. ShapedType fc_type = mlir::cast(fc_output.getType()); - DenseI64ArrayAttr output_shape_attr; - if (output_type.hasRank()) { - output_shape_attr = rewriter.getDenseI64ArrayAttr(output_type.getShape()); + llvm::SmallVector output_shape; + if (tfl_fc_op.getKeepNumDims()) { + const llvm::ArrayRef orig_input_shape = tfl_fc_op.getInput().getType().getShape(); + output_shape.append(orig_input_shape.begin(), orig_input_shape.end() - 1); + output_shape.push_back(OC); } else { - // set output_shape to {N, OC} to match previous results - // with tosa::FullyConnectedOp - output_shape_attr = rewriter.getDenseI64ArrayAttr({N, OC}); + output_shape.append({N, OC}); } - auto output_shape_value = - (output_type.hasRank()) - ? getTosaConstShape(rewriter, op->getLoc(), output_type.getShape()) - : getTosaConstShape(rewriter, op->getLoc(), {N, OC}); + auto output_shape_value = getTosaConstShape(rewriter, op->getLoc(), output_shape); fc_output = CreateOpAndInfer( rewriter, op->getLoc(), UnrankedTensorType::get(fc_type.getElementType()), fc_output, output_shape_value); @@ -2633,7 +2613,7 @@ LogicalResult ConvertTFLReduceAllOp::matchAndRewrite( return rewriter.notifyMatchFailure(op, "fail to get reduction indices"); std::optional result = convertReduceAllOp( - rewriter, op, output_type, tfl_all_op.getInput(), axes_elems); + rewriter, op, output_type, tfl_all_op.getInput(), axes_elems, tfl_all_op.getKeepDims()); if (!result) return failure(); @@ -2655,7 +2635,7 @@ LogicalResult ConvertTFLReduceAnyOp::matchAndRewrite( return failure(); std::optional result = convertReduceAnyOp( - rewriter, op, output_type, tfl_any_op.getInput(), axes_elems); + rewriter, op, output_type, tfl_any_op.getInput(), axes_elems, tfl_any_op.getKeepDims()); if (!result) return failure(); @@ -2677,7 +2657,7 @@ LogicalResult ConvertTFLReduceMaxOp::matchAndRewrite( return failure(); std::optional result = convertReduceMaxOp( - rewriter, op, output_type, tfl_max_op.getInput(), axes_elems); + rewriter, op, output_type, tfl_max_op.getInput(), axes_elems, tfl_max_op.getKeepDims()); if (!result) return failure(); @@ -2699,7 +2679,7 @@ LogicalResult ConvertTFLReduceMinOp::matchAndRewrite( return failure(); std::optional result = convertReduceMinOp( - rewriter, op, output_type, tfl_min_op.getInput(), axes_elems); + rewriter, op, output_type, tfl_min_op.getInput(), axes_elems, tfl_min_op.getKeepDims()); if (!result) return failure(); @@ -2721,7 +2701,7 @@ LogicalResult ConvertTFLReduceProdOp::matchAndRewrite( return failure(); std::optional result = convertReduceProdOp( - rewriter, op, output_type, tfl_prod_op.getInput(), axes_elems); + rewriter, op, output_type, 
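// Note: each convertReduceXOp helper now also receives the TFL op's
// keep_dims flag, so reduced axes are either kept with size 1 or dropped.
// For example, reducing a [2, 3, 4] tensor over axis 1 yields [2, 1, 4]
// with keep_dims = true and [2, 4] with keep_dims = false.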
tfl_prod_op.getInput(), axes_elems, tfl_prod_op.getKeepDims()); if (!result) return failure(); @@ -2743,7 +2723,7 @@ LogicalResult ConvertTFLMeanOp::matchAndRewrite( return failure(); std::optional result = convertReduceMeanOp( - rewriter, op, output_type, tfl_mean_op.getInput(), axes_elems); + rewriter, op, output_type, tfl_mean_op.getInput(), axes_elems, tfl_mean_op.getKeepDims()); if (!result) return failure(); @@ -2765,7 +2745,7 @@ LogicalResult ConvertTFLSumOp::matchAndRewrite( return failure(); std::optional result = convertReduceSumOp( - rewriter, op, output_type, tfl_sum_op.getInput(), axes_elems); + rewriter, op, output_type, tfl_sum_op.getInput(), axes_elems, tfl_sum_op.getKeepDims()); if (!result) return failure(); @@ -3465,17 +3445,11 @@ LogicalResult ConvertTFLHardSwishOp::matchAndRewrite( mlir::dyn_cast_or_null( output_type.getElementType()); - auto hardswish_func = [](double v) -> double { - double w = v + 3.0; - w = w < 0.0 ? 0.0 : w > 6.0 ? 6.0 : w; - return v * w / 6.0; - }; - if (input_qtype.getStorageTypeIntegralWidth() == 8) { // Implement with 8-bit table lookup. - Value table_const = getTosaConst8bitTable( + Value table_const = getTosaConstHardSwish8bitTable( rewriter, op, input_qtype.getScale(), input_qtype.getZeroPoint(), - output_qtype.getScale(), output_qtype.getZeroPoint(), hardswish_func); + output_qtype.getScale(), output_qtype.getZeroPoint()); CreateReplaceOpAndInfer( rewriter, op, output_type, tfl_hardswish_op.getInput(), table_const); @@ -3625,7 +3599,8 @@ LogicalResult ConvertTFLAtan2Op::matchAndRewrite( // Note: the implementation of std::atan2 may be different on // different machines, so may result in varying numerical results. auto atan_func = [](double x) -> double { return std::atan(x); }; - Value table_const = getTosaConst16bitTable(rewriter, op, atan_func, 0.0, 1.0); + Value table_const = getTosaConst16bitTable( + rewriter, op, 1.0 / 65535.0, -32768, 2.0 / 65535.0, 0, atan_func); auto table_result = CreateOpAndInfer( rewriter, loc, output_ty.clone(rewriter.getIntegerType(32)), casted, table_const); @@ -3718,13 +3693,10 @@ LogicalResult ConvertTFLLogisticOp::matchAndRewrite( return rewriter.notifyMatchFailure( op, "input/output zeropoint should be 0 in 16-bit mode"); } - double input_min = -32768 * input_qtype.getScale(); - double input_max = 32767 * input_qtype.getScale(); - // Generate table with gen_lut() in - // tensorflow/lite/kernels/internal/common.h - Value table_const = getTosaConst16bitTable(rewriter, op, sigmoid_func, - input_min, input_max); + Value table_const = + getTosaConst16bitTable(rewriter, op, input_qtype.getScale(), + 0, 2.0 / 65535.0, 0, sigmoid_func); auto op1_table_in = CreateOpAndInfer(rewriter, op->getLoc(), int32_type, @@ -3732,7 +3704,7 @@ LogicalResult ConvertTFLLogisticOp::matchAndRewrite( Value op2_rescale_op1 = buildRescale(rewriter, op, output_type, op1_table_in.getResult(), - 1.0 / 128.0, 0, 0, false, true); + 1.0 / 128.0, 0, 0, "SINGLE_ROUND", true); rewriter.replaceOp(op, {op2_rescale_op1}); } @@ -3790,13 +3762,9 @@ LogicalResult ConvertTFLTanhOp::matchAndRewrite( return rewriter.notifyMatchFailure( op, "input/output zeropoint should be 0 in 16-bit mode"); } - double input_min = -32768 * input_qtype.getScale(); - double input_max = 32767 * input_qtype.getScale(); - // Generate table with gen_lut() in - // tensorflow/lite/kernels/internal/common.h - Value table_const = - getTosaConst16bitTable(rewriter, op, tanh_func, input_min, input_max); + Value table_const = getTosaConst16bitTable( + rewriter, op, 
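// The new table interface takes (input_scale, input_zp, output_scale,
// output_zp, func). For tanh and logistic, an output scale of 2.0 / 65535.0
// with zero point 0 places a function value of 1.0 at roughly the int16
// maximum (1.0 / (2.0 / 65535.0) is about 32768), mirroring the removed
// gen_lut-style tables that scaled samples by 32768.0.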
input_qtype.getScale(), 0, 2.0 / 65535.0, 0, tanh_func); auto op1_table_in = CreateOpAndInfer(rewriter, op->getLoc(), int32_type, @@ -3804,7 +3772,7 @@ LogicalResult ConvertTFLTanhOp::matchAndRewrite( Value op2_rescale_op1 = buildRescale(rewriter, op, output_type, op1_table_in.getResult(), - 1.0 / 128.0, 0, 0, false, true); + 1.0 / 128.0, 0, 0, "SINGLE_ROUND", true); rewriter.replaceOp(op, {op2_rescale_op1}); } @@ -3822,7 +3790,7 @@ static LogicalResult LegalizeFloatingPointPrelu(Operation* op, Value input, Value alpha, ShapedType output_type) { Value mul = CreateMulOpAndInfer(rewriter, op, output_type, input, alpha); - auto rank = mul.getType().cast().getRank(); + auto rank = mlir::cast(mul.getType()).getRank(); Value const_zero = getTosaConstTensorSingleF32(rewriter, op, 0.0, rank); auto ge = CreateOpAndInfer( rewriter, op->getLoc(), output_type.clone(rewriter.getIntegerType(1)), @@ -3880,7 +3848,7 @@ static LogicalResult LegalizeQuantizedPrelu(Operation* op, // Initalize the negative values to the slope of leaky ReLU. Value op_rescale_slope_in = buildRescale( rewriter, op, output_type, input, scale_alpha, input_qtype.getZeroPoint(), - output_qtype.getZeroPoint(), true, true); + output_qtype.getZeroPoint(), "DOUBLE_ROUND", true); // Perform an element-wise multiplication on rescaled alpha and input for // PReLU. @@ -3897,11 +3865,11 @@ static LogicalResult LegalizeQuantizedPrelu(Operation* op, op_rescale_slope_in = buildRescale(rewriter, op, output_type, op_mul, scale_alpha, - /* input_zp = */ 0, output_qtype.getZeroPoint(), true, true); + /* input_zp = */ 0, output_qtype.getZeroPoint(), "DOUBLE_ROUND", true); Value op_rescale_identity_in = buildRescale( rewriter, op, output_type, input, scale_identity, - input_qtype.getZeroPoint(), output_qtype.getZeroPoint(), true, true); + input_qtype.getZeroPoint(), output_qtype.getZeroPoint(), "DOUBLE_ROUND", true); CreateReplaceOpAndInfer(rewriter, op, output_type, op_ge, op_rescale_identity_in, @@ -3965,11 +3933,11 @@ static LogicalResult LegalizeQuantizedLeakyRelu(Operation* op, Value op_rescale_alpha_in = buildRescale(rewriter, op, rescale_type, input, scale_alpha, - input_qtype.getZeroPoint(), 0, true, true); + input_qtype.getZeroPoint(), 0, "DOUBLE_ROUND", true); Value op_rescale_identity_in = buildRescale(rewriter, op, rescale_type, input, scale_identity, - input_qtype.getZeroPoint(), 0, true, true); + input_qtype.getZeroPoint(), 0, "DOUBLE_ROUND", true); Value result_int32; if (alpha <= 1.0) { @@ -3996,7 +3964,7 @@ static LogicalResult LegalizeFloatingPointLeakyRelu(Operation* op, PatternRewriter& rewriter, Value input, double alpha, ShapedType output_type) { - auto rank = input.getType().cast().getRank(); + auto rank = mlir::cast(input.getType()).getRank(); Value const_alpha = getTosaConstTensorSingleF32(rewriter, op, alpha, rank); auto mul = CreateMulOpAndInfer(rewriter, op, output_type, input, const_alpha); if (alpha <= 1.0) { @@ -4171,7 +4139,7 @@ LogicalResult ConvertTFLQuantizeOp::matchAndRewrite( Value rescale_op = buildRescale(rewriter, op, output_type, tfl_quantize_op.getInput(), rescale_scale, input_element_type.getZeroPoint(), - element_type.getZeroPoint(), true, true); + element_type.getZeroPoint(), "DOUBLE_ROUND", true); rewriter.replaceOp(op, {rescale_op}); return success(); @@ -4371,6 +4339,22 @@ LogicalResult ConvertTFLGatherNdOp::matchAndRewrite( return success(); } +LogicalResult ConvertTFLScatterNdOp::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + auto tfl_scatternd_op = cast(op); + + const 
std::optional result = convertScatterNdOp( + rewriter, op, tfl_scatternd_op.getResult(), tfl_scatternd_op.getIndices(), + tfl_scatternd_op.getUpdates(), tfl_scatternd_op.getShape()); + + if (!result) { + return failure(); + } + rewriter.replaceOp(op, {result.value()}); + + return success(); +} + LogicalResult ConvertTFLSparseToDenseOp::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { auto tfl_sparse_to_dense_op = cast(op); @@ -4551,12 +4535,12 @@ LogicalResult ConvertTFLArgMinOp::matchAndRewrite( // so need to rescale ArgMax output to original output zero point int output_zp = 0; Type output_ty = arg_min_op.getType(); - Type output_ety = output_ty.cast().getElementType(); + Type output_ety = mlir::cast(output_ty).getElementType(); if (auto output_quantized_ty = dyn_cast(output_ety)) { output_zp = output_quantized_ty.getZeroPoint(); if (output_zp != 0) { // need to rescale arg_max output to output zero point - output_ty = output_ty.cast().clone(input_ety); + output_ty = mlir::cast(output_ty).clone(input_ety); } } @@ -4572,7 +4556,7 @@ LogicalResult ConvertTFLArgMinOp::matchAndRewrite( result = buildRescale(rewriter, op, arg_min_op.getType(), result, /* sclae = */ 1.0, /* input_zp = */ 0, - /* output_zp = */ output_zp, false, true); + /* output_zp = */ output_zp, "SINGLE_ROUND", true); } rewriter.replaceOp(op, {result}); @@ -4624,11 +4608,11 @@ LogicalResult ConvertTFLWhileOp::matchAndRewrite( auto while_op = rewriter.create( op->getLoc(), op->getResultTypes(), op->getOperands()); - rewriter.createBlock(&while_op.getCond()); - rewriter.createBlock(&while_op.getBody()); + rewriter.createBlock(&while_op.getCondGraph()); + rewriter.createBlock(&while_op.getBodyGraph()); - inlineWhileCase(tfl_while_op.getCond(), while_op.getCond(), rewriter); - inlineWhileCase(tfl_while_op.getBody(), while_op.getBody(), rewriter); + inlineWhileCase(tfl_while_op.getCond(), while_op.getCondGraph(), rewriter); + inlineWhileCase(tfl_while_op.getBody(), while_op.getBodyGraph(), rewriter); rewriter.replaceOp(tfl_while_op, while_op.getResults()); @@ -4826,6 +4810,11 @@ LogicalResult ConvertTFLLogicalOrOp::matchAndRewrite( return ConvertBinaryOp(op, rewriter); } +LogicalResult ConvertTFLBitwiseXorOp::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + return ConvertBinaryOp(op, rewriter); +} + LogicalResult ConvertTFLPowOp::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { return ConvertBinaryOp(op, rewriter); @@ -4846,6 +4835,128 @@ LogicalResult ConvertTFLBroadcastToOp::matchAndRewrite( return success(); } +LogicalResult ConvertTFLExpOp::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + auto tfl_exp_op = cast(op); + + RankedTensorType input_type = + dyn_cast(tfl_exp_op.getX().getType()); + RankedTensorType output_type = + dyn_cast(tfl_exp_op.getResult().getType()); + + if (!input_type || !output_type) { + return rewriter.notifyMatchFailure( + op, "input/output are not all a ranked tensor"); + } + + mlir::quant::UniformQuantizedType input_qtype = + dyn_cast_or_null( + input_type.getElementType()); + mlir::quant::UniformQuantizedType output_qtype = + dyn_cast_or_null( + output_type.getElementType()); + + if ((input_qtype == nullptr) != (output_qtype == nullptr)) { + return rewriter.notifyMatchFailure( + op, + "input/output tensor should be all quantized or all floating-point"); + } + + // Quantization case + if (input_qtype && output_qtype) { + auto exp_func = [](float x) -> float { return std::exp(x); }; + + Value table_const; + if 
(input_qtype.getStorageTypeIntegralWidth() == 8) { + table_const = getTosaConst8bitTable( + rewriter, op, input_qtype.getScale(), input_qtype.getZeroPoint(), + output_qtype.getScale(), output_qtype.getZeroPoint(), exp_func); + } else if (input_qtype.getStorageTypeIntegralWidth() == 16) { + table_const = getTosaConst16bitTable( + rewriter, op, input_qtype.getScale(), input_qtype.getZeroPoint(), + output_qtype.getScale(), output_qtype.getZeroPoint(), exp_func); + } else { + return rewriter.notifyMatchFailure( + op, "only quantized int8 and int16 are supported"); + } + + CreateReplaceOpAndInfer(rewriter, op, output_type, + tfl_exp_op.getX(), table_const); + return success(); + } + + CreateReplaceOpAndInfer(rewriter, op, tfl_exp_op.getType(), + tfl_exp_op.getX()); + + return success(); +} + +LogicalResult ConvertTFLLogOp::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + auto tfl_log_op = cast(op); + + RankedTensorType input_type = + dyn_cast(tfl_log_op.getX().getType()); + RankedTensorType output_type = + dyn_cast(tfl_log_op.getResult().getType()); + + if (!input_type || !output_type) { + return rewriter.notifyMatchFailure( + op, "input/output are not all a ranked tensor"); + } + + mlir::quant::UniformQuantizedType input_qtype = + dyn_cast_or_null( + input_type.getElementType()); + mlir::quant::UniformQuantizedType output_qtype = + dyn_cast_or_null( + output_type.getElementType()); + + if ((input_qtype == nullptr) != (output_qtype == nullptr)) { + return rewriter.notifyMatchFailure( + op, + "input/output tensor should be all quantized or all floating-point"); + } + + // Quantization case + if (input_qtype && output_qtype) { + const float output_min = + ((input_qtype.getStorageTypeIntegralWidth() == 8 ? -128 : -32768) - + output_qtype.getZeroPoint()) * + static_cast(output_qtype.getScale()); + + auto log_func = [&](float x) -> float { + if (x <= 0.0f) { + return output_min; + } + return std::log(x); + }; + + Value table_const; + if (input_qtype.getStorageTypeIntegralWidth() == 8) { + table_const = getTosaConst8bitTable( + rewriter, op, input_qtype.getScale(), input_qtype.getZeroPoint(), + output_qtype.getScale(), output_qtype.getZeroPoint(), log_func); + } else if (input_qtype.getStorageTypeIntegralWidth() == 16) { + table_const = getTosaConst16bitTable( + rewriter, op, input_qtype.getScale(), input_qtype.getZeroPoint(), + output_qtype.getScale(), output_qtype.getZeroPoint(), log_func); + } else { + return rewriter.notifyMatchFailure( + op, "only quantized int8 and int16 are supported"); + } + + CreateReplaceOpAndInfer(rewriter, op, output_type, + tfl_log_op.getX(), table_const); + return success(); + } + + CreateReplaceOpAndInfer(rewriter, op, tfl_log_op.getType(), + tfl_log_op.getX()); + + return success(); +} + LogicalResult LegalizeTFL::initialize(MLIRContext* context) { RewritePatternSet patterns(context); mlir::tosa::populateLegalizeTFLPatterns(context, patterns); @@ -4881,6 +4992,7 @@ void populateLegalizeTFLPatterns(MLIRContext* ctx, DEF_PATTERN_INSERT(TFLLogicalAnd); DEF_PATTERN_INSERT(TFLLogicalOr); + DEF_PATTERN_INSERT(TFLBitwiseXor); DEF_PATTERN_INSERT(TFLPow); DEF_PATTERN_INSERT(TFLGelu); @@ -4972,6 +5084,7 @@ void populateLegalizeTFLPatterns(MLIRContext* ctx, DEF_PATTERN_INSERT(TFLConst); DEF_PATTERN_INSERT(TFLQConst); DEF_PATTERN_INSERT(TFLGatherNd); + DEF_PATTERN_INSERT(TFLScatterNd); DEF_PATTERN_INSERT(TFLSparseToDense); DEF_PATTERN_INSERT(Constant); DEF_PATTERN_INSERT(TFLOneHot); diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc 
b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc index 97c10593bd9e..11c6212a9eac 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc @@ -22,22 +22,25 @@ limitations under the License. #include #include "llvm/ADT/SmallVector.h" -#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project -#include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project -#include "mlir/Dialect/Tosa/Utils/QuantUtils.h" // from @llvm-project -#include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" // from @llvm-project -#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project -#include "mlir/IR/BuiltinTypes.h" // from @llvm-project -#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project +#include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project +#include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" // from @llvm-project +#include "mlir/Dialect/Tosa/Utils/QuantUtils.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/kernels/internal/common.h" #include "tensorflow/compiler/mlir/lite/kernels/internal/quantization_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "tensorflow/compiler/mlir/tosa/transforms/legalize_common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/hard_swish.h" #include "xla/tsl/framework/fixedpoint/FixedPoint.h" // Implements legalization and post-legalization optimization helper functions @@ -110,8 +113,8 @@ std::optional convertTFConv2DCommon( stride = rewriter.getDenseI64ArrayAttr({1, 1}); } else { // Note: hardcoded to NHWC for now - int64_t stride_h = strides_attr[1].cast().getInt(); - int64_t stride_w = strides_attr[2].cast().getInt(); + int64_t stride_h = mlir::cast(strides_attr[1]).getInt(); + int64_t stride_w = mlir::cast(strides_attr[2]).getInt(); stride = rewriter.getDenseI64ArrayAttr({stride_h, stride_w}); } } @@ -120,8 +123,8 @@ std::optional convertTFConv2DCommon( dilation = rewriter.getDenseI64ArrayAttr({1, 1}); } else { // Note: hardcoded to NHWC for now - int64_t dilation_h = dilations_attr[1].cast().getInt(); - int64_t dilation_w = dilations_attr[2].cast().getInt(); + int64_t dilation_h = mlir::cast(dilations_attr[1]).getInt(); + int64_t dilation_w = mlir::cast(dilations_attr[2]).getInt(); dilation = rewriter.getDenseI64ArrayAttr({dilation_h, dilation_w}); } } @@ -169,9 +172,9 @@ std::optional convertTFConv3DCommon( // Defaults to [1, 1, 1]. 
strides = rewriter.getDenseI64ArrayAttr({1, 1, 1}); } else { - int64_t stride_d = strides_attr[1].cast().getInt(); - int64_t stride_h = strides_attr[2].cast().getInt(); - int64_t stride_w = strides_attr[3].cast().getInt(); + int64_t stride_d = mlir::cast(strides_attr[1]).getInt(); + int64_t stride_h = mlir::cast(strides_attr[2]).getInt(); + int64_t stride_w = mlir::cast(strides_attr[3]).getInt(); strides = rewriter.getDenseI64ArrayAttr({stride_d, stride_h, stride_w}); } @@ -180,17 +183,18 @@ std::optional convertTFConv3DCommon( // Defaults to [1, 1, 1]. dilations = rewriter.getDenseI64ArrayAttr({1, 1, 1}); } else { - int64_t dilation_d = dilations_attr[1].cast().getInt(); - int64_t dilation_h = dilations_attr[2].cast().getInt(); - int64_t dilation_w = dilations_attr[3].cast().getInt(); + int64_t dilation_d = mlir::cast(dilations_attr[1]).getInt(); + int64_t dilation_h = mlir::cast(dilations_attr[2]).getInt(); + int64_t dilation_w = mlir::cast(dilations_attr[3]).getInt(); dilations = rewriter.getDenseI64ArrayAttr({dilation_d, dilation_h, dilation_w}); } - RankedTensorType input_type = input.getType().cast(); + RankedTensorType input_type = mlir::cast(input.getType()); DenseI64ArrayAttr pads; { - RankedTensorType filter_type = filter.getType().cast(); + RankedTensorType filter_type = + mlir::cast(filter.getType()); tensorflow::TensorFormat data_format_tf; if (!FormatFromString(data_format_ref, &data_format_tf)) { @@ -263,8 +267,7 @@ std::optional buildReshapeWithDynamicDims(PatternRewriter& rewriter, llvm::ArrayRef dims) { const ShapedType input_ty = dyn_cast(input_value.getType()); if (!input_ty) { - (void)rewriter.notifyMatchFailure( - op, "input is not a shaped type"); + (void)rewriter.notifyMatchFailure(op, "input is not a shaped type"); return std::nullopt; } @@ -315,13 +318,13 @@ std::optional buildReshapeWithDynamicDims(PatternRewriter& rewriter, // can easily resolve the dim to be static if (input_ty.hasStaticShape() && dyn_count == 1) { const int64_t total_elements = input_ty.getNumElements(); - const int64_t shape_elements = std::accumulate(static_dims.begin(), static_dims.end(), 1, - [](int64_t a, int64_t b) { - return b == tensorflow::kTFDynamicSize ? a : a * b; - }); + const int64_t shape_elements = std::accumulate( + static_dims.begin(), static_dims.end(), 1, [](int64_t a, int64_t b) { + return b == tensorflow::kTFDynamicSize ? 
a : a * b; + }); const int64_t dynamic_dim_value = total_elements / shape_elements; - std::replace(static_dims.begin(), static_dims.end(), tensorflow::kTFDynamicSize, - dynamic_dim_value); + std::replace(static_dims.begin(), static_dims.end(), + tensorflow::kTFDynamicSize, dynamic_dim_value); } DenseI64ArrayAttr shape_attr = rewriter.getDenseI64ArrayAttr(static_dims); @@ -330,25 +333,56 @@ std::optional buildReshapeWithDynamicDims(PatternRewriter& rewriter, auto shape_value = getTosaConstShape(rewriter, op->getLoc(), static_dims); return rewriter - .create(op->getLoc(), output_ty, input_value, shape_value) + .create(op->getLoc(), output_ty, input_value, + shape_value) .getResult(); } +Value buildRescaleMultiplier(bool scale32, OpBuilder& builder, Location loc, + ArrayRef multipliers) { + if (scale32) { + return tosa::getConstTensorInt(builder, loc, multipliers); + } else { + SmallVector vec(multipliers.begin(), multipliers.end()); + return tosa::getConstTensorInt(builder, loc, vec); + } +} + // Create a TOSA rescale op from TFLite scaling multiplier, scaling shift, zero // points and rounding mode Value buildRescale(PatternRewriter& rewriter, Operation* op, ShapedType output_type, Value input_val, - int32_t scale_multiplier, int32_t scale_shit, - int64_t input_zp, int64_t output_zp, bool double_round, + int32_t scale_multiplier, int32_t scale_shift, + int64_t input_zp, int64_t output_zp, StringRef rounding_mode, bool scale32) { + bool input_unsigned = input_val.getType().isUnsignedInteger(); + bool output_unsigned = output_type.isUnsignedInteger(); + auto loc = op->getLoc(); + Value multiplier_val = + buildRescaleMultiplier(scale32, rewriter, loc, {scale_multiplier}); + auto shift_val = tosa::getConstTensorInt(rewriter, loc, + {static_cast(scale_shift)}); + + // Create input_zp matches the input type and output_zp matches the output + // type of RescaleOp + const Value empty_output_val = rewriter.create( + loc, output_type.getShape(), output_type.getElementType()); + const auto input_zp_val = + tosa::createZeroPointTensor(rewriter, loc, input_val.getType(), input_zp); + if (!input_zp_val.has_value()) + op->emitError("Failed to create input zero-point tensor for RescaleOp."); + + const auto output_zp_val = + tosa::createZeroPointTensor(rewriter, loc, empty_output_val.getType(), output_zp); + if (!output_zp_val.has_value()) + op->emitError("Failed to create output zero-point tensor for RescaleOp."); + auto rescale_op = CreateOpAndInfer( - rewriter, op->getLoc(), output_type, input_val, - rewriter.getI32IntegerAttr(static_cast(input_zp)), - rewriter.getI32IntegerAttr(static_cast(output_zp)), - rewriter.getDenseI32ArrayAttr({scale_multiplier}), - rewriter.getDenseI8ArrayAttr({static_cast(scale_shit)}), - rewriter.getBoolAttr(scale32), rewriter.getBoolAttr(double_round), - rewriter.getBoolAttr(false)); + rewriter, loc, output_type, input_val, multiplier_val, shift_val, + input_zp_val.value(), output_zp_val.value(), + rewriter.getBoolAttr(scale32), rewriter.getStringAttr(rounding_mode), + rewriter.getBoolAttr(false), rewriter.getBoolAttr(input_unsigned), + rewriter.getBoolAttr(output_unsigned)); return rescale_op.getResult(); } @@ -356,17 +390,19 @@ Value buildRescale(PatternRewriter& rewriter, Operation* op, // Create a TOSA rescale op from TFLite scaling, zero points and rounding mode Value buildRescale(PatternRewriter& rewriter, Operation* op, ShapedType output_type, Value input_val, double scale, - int64_t input_zp, int64_t output_zp, bool double_round, + int64_t input_zp, int64_t output_zp, 
StringRef rounding_mode, bool scale32) { int32_t multiplier; int32_t shift; int32_t scale_width = scale32 ? 32 : 16; - computeMultiplierAndShift(scale, multiplier, shift, scale_width); + if (!computeMultiplierAndShift(scale, multiplier, shift, scale_width)) { + op->emitError("buildRescale: shift must be in the range 2 <= shift <= 62"); + } return buildRescale(rewriter, op, output_type, input_val, multiplier, shift, - input_zp, output_zp, double_round, scale32); + input_zp, output_zp, rounding_mode, scale32); } // Removes the zero point and cast to int32, no need to handle roundings modes @@ -384,9 +420,12 @@ Value buildRescaleToInt32(PatternRewriter& rewriter, Operation* op, assert(input_type); auto output_type = input_type.clone(rewriter.getI32Type()); + std::string rounding_mode = + IsTFLDoubleRoundingMode() ? "DOUBLE_ROUND" : "SINGLE_ROUND"; + return buildRescale(rewriter, op, output_type, input_val, input_scale_multiplier, input_scale_shift, input_zp, - /*input_zp=*/0, IsTFLDoubleRoundingMode(), + /*output_zp=*/0, rounding_mode, /*scale32=*/true); } @@ -414,9 +453,12 @@ Value buildRescaleFromInt32(PatternRewriter& rewriter, Operation* op, assert(input_type && input_type.getElementType().isInteger(32) && "expected rescale input element type to be i32"); + std::string rounding_mode = + IsTFLDoubleRoundingMode() ? "DOUBLE_ROUND" : "SINGLE_ROUND"; + // Potentially check input_shape == output_shape here return buildRescale(rewriter, op, output_type, input_val, output_scale, - /*input_zp=*/0, output_zp, IsTFLDoubleRoundingMode(), + /*input_zp=*/0, output_zp, rounding_mode, /*scale32=*/true); } @@ -437,7 +479,24 @@ Value buildRescaleOpConvOutput(PatternRewriter& rewriter, Operation* op, bool scale32 = isScale32(output_qtype); int32_t scale_width = scale32 ? 32 : 16; // Only use double round if we are doing 32 bit scaling - bool double_round = scale32; + std::string rounding_mode = scale32 ? 
"DOUBLE_ROUND" : "SINGLE_ROUND"; + + bool input_unsigned = input_qtype.isUnsignedInteger(); + bool output_unsigned = output_qtype.isUnsignedInteger(); + + auto loc = op->getLoc(); + const Value empty_output_val = rewriter.create( + loc, output_type.getShape(), output_type.getElementType()); + + const auto input_zp_val = tosa::createZeroPointTensor( + rewriter, loc, conv_val.getType(), static_cast(0)); + if (!input_zp_val.has_value()) + op->emitError("Failed to create input zero-point tensor for RescaleOp."); + + const auto output_zp_val = + tosa::createZeroPointTensor(rewriter, loc, empty_output_val.getType(), output_zp); + if (!output_zp_val.has_value()) + op->emitError("Failed to create output zero-point tensor for RescaleOp."); if (auto weight_per_tensor_qtype = dyn_cast( @@ -452,13 +511,17 @@ Value buildRescaleOpConvOutput(PatternRewriter& rewriter, Operation* op, computeMultiplierAndShift(op_tensor_scale, multiplier, shift, scale_width); + Value multiplier_val = + buildRescaleMultiplier(scale32, rewriter, loc, {multiplier}); + auto shift_val = + tosa::getConstTensorInt(rewriter, loc, {static_cast(shift)}); + auto rescale_op = CreateOpAndInfer( - rewriter, op->getLoc(), output_type, conv_val, - rewriter.getI32IntegerAttr(0), rewriter.getI32IntegerAttr(output_zp), - rewriter.getDenseI32ArrayAttr({multiplier}), - rewriter.getDenseI8ArrayAttr({static_cast(shift)}), - rewriter.getBoolAttr(scale32), rewriter.getBoolAttr(double_round), - rewriter.getBoolAttr(false)); + rewriter, loc, output_type, conv_val, multiplier_val, shift_val, + input_zp_val.value(), output_zp_val.value(), + rewriter.getBoolAttr(scale32), rewriter.getStringAttr(rounding_mode), + rewriter.getBoolAttr(false), rewriter.getBoolAttr(input_unsigned), + rewriter.getBoolAttr(output_unsigned)); return rescale_op.getResult(); @@ -482,19 +545,35 @@ Value buildRescaleOpConvOutput(PatternRewriter& rewriter, Operation* op, double op_channel_scale = (input_scale * weight_scale) / output_scale; - computeMultiplierAndShift(op_channel_scale, multiplier, shift, - scale_width); + if (!computeMultiplierAndShift(op_channel_scale, multiplier, shift, 32)) { + op->emitError( + "buildRescaleOpConvOutput: shift must be in the range 2 <= shift " + "<= 62"); + } + // We are matching the tflite behaviour here by scaling by 32-bit + // then down-scaling to 16-bit for int16x8 + // Reference: tensorflow/lite/kernels/internal/common.cc + if (!scale32) { + multiplier = (multiplier < 0x7FFF0000) + ? 
((multiplier + (1 << 15)) >> 16) + : 0x7FFF; + shift = shift - 16; + } multiplier_arr.push_back(multiplier); shift_arr.push_back(static_cast(shift)); } + Value multiplier_val = + buildRescaleMultiplier(scale32, rewriter, loc, multiplier_arr); + auto shift_val = tosa::getConstTensorInt(rewriter, loc, shift_arr); + auto rescale_op = CreateOpAndInfer( - rewriter, op->getLoc(), output_type, conv_val, - rewriter.getI32IntegerAttr(0), rewriter.getI32IntegerAttr(output_zp), - rewriter.getDenseI32ArrayAttr(multiplier_arr), - rewriter.getDenseI8ArrayAttr(shift_arr), rewriter.getBoolAttr(scale32), - rewriter.getBoolAttr(double_round), rewriter.getBoolAttr(true)); + rewriter, loc, output_type, conv_val, multiplier_val, shift_val, + input_zp_val.value(), output_zp_val.value(), + rewriter.getBoolAttr(scale32), rewriter.getStringAttr(rounding_mode), + rewriter.getBoolAttr(true), rewriter.getBoolAttr(input_unsigned), + rewriter.getBoolAttr(output_unsigned)); return rescale_op.getResult(); @@ -504,6 +583,90 @@ Value buildRescaleOpConvOutput(PatternRewriter& rewriter, Operation* op, } } +Value getTosaConstHardSwish8bitTable(PatternRewriter& rewriter, Operation* op, + float input_scale, int32_t input_zp, + float output_scale, int32_t output_zp) { + // Define tflite params: + // See: HardSwishPrepare / HardSwishParams + const float hires_input_scale = (1.0f / 128.0f) * input_scale; + const float reluish_scale = 3.0f / 32768.0f; + const float output_multiplier = hires_input_scale / output_scale; + + int16_t output_multiplier_fixedpoint_int16; + int output_multiplier_exponent; + + int16_t reluish_multiplier_fixedpoint_int16; + int reluish_multiplier_exponent; + + int32_t output_multiplier_fixedpoint_int32; + tflite::QuantizeMultiplier(output_multiplier, + &output_multiplier_fixedpoint_int32, + &output_multiplier_exponent); + tflite::DownScaleInt32ToInt16Multiplier(output_multiplier_fixedpoint_int32, + &output_multiplier_fixedpoint_int16); + assert(output_multiplier_exponent <= 0); + + const float reluish_multiplier = hires_input_scale / reluish_scale; + int32_t reluish_multiplier_fixedpoint_int32; + + tflite::QuantizeMultiplier(reluish_multiplier, + &reluish_multiplier_fixedpoint_int32, + &reluish_multiplier_exponent); + tflite::DownScaleInt32ToInt16Multiplier(reluish_multiplier_fixedpoint_int32, + &reluish_multiplier_fixedpoint_int16); + + // See HardSwish function in + // tensorflow/lite/kernels/internal/reference/hardswish.h + SmallVector table; + for (int32_t i = -128; i < 128; i++) { + const int16_t input_value = i - input_zp; + const int16_t input_value_on_hires_input_scale = input_value * (1 << 7); + const int16_t input_value_on_preshift_output_scale = + gemmlowp::SaturatingRoundingDoublingHighMul( + input_value_on_hires_input_scale, + output_multiplier_fixedpoint_int16); + int16_t reluish_value = input_value_on_hires_input_scale; + if (reluish_multiplier_exponent > 0) { + reluish_value = tflite::reference_ops::SaturatingLeftShift( + reluish_value, reluish_multiplier_exponent - 1); + } + reluish_value = gemmlowp::SaturatingRoundingDoublingHighMul( + reluish_value, reluish_multiplier_fixedpoint_int16); + if (reluish_multiplier_exponent > 0) { + reluish_value = + tflite::reference_ops::SaturatingLeftShift(reluish_value, 1); + } + if (reluish_multiplier_exponent < 0) { + reluish_value = gemmlowp::RoundingDivideByPOT( + reluish_value, -reluish_multiplier_exponent); + } + reluish_value = (reluish_value + (1 << 15)) >> 1; + const int16_t preshift_output_value = + 
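// This fixed-point sequence mirrors the float formula used by the removed
// hardswish_func lambda, hardswish(x) = x * clamp(x + 3.0, 0.0, 6.0) / 6.0:
// reluish_value approximates the clamp(x + 3, 0, 6) / 6 factor (remapped to
// [0, 1] by the "(reluish_value + (1 << 15)) >> 1" step above) and is
// multiplied below with the input rescaled to the pre-shift output scale.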
tflite::reference_ops::SaturatingDoublingHighMul( + reluish_value, input_value_on_preshift_output_scale); + int16_t output_value = gemmlowp::RoundingDivideByPOT( + preshift_output_value, -output_multiplier_exponent); + output_value += output_zp; + output_value = + std::min(output_value, std::numeric_limits::max()); + output_value = + std::max(output_value, std::numeric_limits::min()); + table.push_back(output_value); + } + + auto element_qtype = + UniformQuantizedType::get(true, rewriter.getIntegerType(8), + rewriter.getF32Type(), 1.0f, 0, -128, 127); + auto const_type = tensorflow::GetTypeFromTFTensorShape({256}, element_qtype); + auto storage_type = tensorflow::GetTypeFromTFTensorShape( + {256}, element_qtype.getStorageType()); + auto const_attr = DenseElementsAttr::get(storage_type, llvm::ArrayRef(table)); + + auto const_op = + rewriter.create(op->getLoc(), const_type, const_attr); + return const_op.getResult(); +} + Value getTosaConstRsqrt8bitTable(PatternRewriter& rewriter, Operation* op, float input_scale, int32_t input_zp, float output_scale, int32_t output_zp) { @@ -559,24 +722,25 @@ Value getTosaConstRsqrt8bitTable(PatternRewriter& rewriter, Operation* op, } // Create a 8-bit TOSA TABLE constant tensor with int8[256] array. -// Follow PopulateLookupTable() tensorflow/lite/kernels/activations.cc +// Follow LUTPopulateInt8() tensorflow/lite/kernels/internal/common.h Value getTosaConst8bitTable(PatternRewriter& rewriter, Operation* op, - double input_scale, int32_t input_zp, - double output_scale, int32_t output_zp, - std::function func) { + float input_scale, int32_t input_zp, + float output_scale, int32_t output_zp, + std::function func) { SmallVector table; + float inverse_scale = 1.0f / output_scale; for (int32_t i = -128; i < 128; i++) { - double dequantized = input_scale * (i - input_zp); - double transformed = func(dequantized); + float dequantized = input_scale * (i - input_zp); + float transformed = func(dequantized); - double max = (output_scale > 1.0) ? DBL_MAX : (DBL_MAX * output_scale); + float max = (output_scale > 1.0) ? FLT_MAX : (FLT_MAX * output_scale); if (transformed >= max) { table.push_back(INT8_MAX); continue; } - int32_t rescaled = std::llround(transformed / output_scale); + int32_t rescaled = std::round(transformed * inverse_scale); int32_t quantized = static_cast(rescaled + output_zp); table.push_back( static_cast(std::min(std::max(quantized, -128), 127))); @@ -595,34 +759,52 @@ Value getTosaConst8bitTable(PatternRewriter& rewriter, Operation* op, return const_op.getResult(); } -// Create a 16-bit TOSA TABLE constant tensor with int16[513] array. -// Output is restricted to [-1.0, 1.0]. -// Follow gen_lut() tensorflow/lite/kernels/internal/common.h +// Create a 16-bit TOSA TABLE constant tensor. +// A float should be used by default for FloatT except if a double is required +// for backward compatibility. 
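// The generated table holds 513 int16 entries: entry i samples func at
// input_min + i * step, where input_min and input_max come from
// (input_scale, input_zp) applied over the int16 domain and
// step = (input_max - input_min) / 512; the final entry samples input_max.
// Samples are scaled by 65536 / (output_max - output_min) and corrected by
// half of the midpoint interpolation error so piecewise-linear lookups stay
// close to func. In the atan2 lowering above, for instance, an input_scale
// of 1.0 / 65535.0 with input_zp of -32768 gives an input domain of exactly
// [0, 1].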
+// Follow LUTPopulateInt16() tensorflow/lite/kernels/internal/common.h +template Value getTosaConst16bitTable(PatternRewriter& rewriter, Operation* op, - std::function func, double min, - double max) { + FloatT input_scale, int32_t input_zp, + FloatT output_scale, int32_t output_zp, + std::function func) { + static_assert(std::is_floating_point::value, + "FloatT must be a floating-point type."); + SmallVector table; - double step = (max - min) / 512.0f; - double half_step = step / 2.0f; + FloatT input_min = + input_scale * (std::numeric_limits::min() - input_zp); + FloatT input_max = + input_scale * (std::numeric_limits::max() - input_zp); + FloatT output_min = + output_scale * (std::numeric_limits::min() - output_zp); + FloatT output_max = + output_scale * (std::numeric_limits::max() - output_zp); + + FloatT step = (input_max - input_min) / 512; + FloatT half_step = step / 2; + FloatT output_scaling_inv = 65536 / (output_max - output_min); + for (int32_t i = 0; i < 512; i++) { - int32_t sample_val = std::llround(func(min + (i * step)) * 32768.0); - double midpoint_interp_val = - std::round(((func(min + (i + 1) * step) * 32768.0) + - std::round(func(min + (i * step)) * 32768.0)) / - 2.0); - double midpoint_val = - std::round(func(min + (i * step) + half_step) * 32768.0); - double midpoint_err = midpoint_interp_val - midpoint_val; - int32_t bias = std::llround(midpoint_err / 2.0); + FloatT sample_val = + std::round(func(input_min + (i * step)) * output_scaling_inv); + FloatT midpoint_interp_val = std::round( + ((func(input_min + (i + 1) * step) * output_scaling_inv) + + std::round(func(input_min + (i * step)) * output_scaling_inv)) / + 2); + FloatT midpoint_val = std::round(func(input_min + (i * step) + half_step) * + output_scaling_inv); + FloatT midpoint_err = midpoint_interp_val - midpoint_val; + FloatT bias = std::round(midpoint_err / 2); table.push_back(static_cast( - std::min(std::max(sample_val - bias, -32768), 32767))); + std::min(std::max(sample_val - bias, -32768), 32767))); } - int32_t max_val = std::llround(func(max) * 32768.0); - table.push_back( - static_cast(std::min(std::max(max_val, -32768), 32767))); + FloatT max_val = std::round(func(input_max) * output_scaling_inv); + table.push_back(static_cast( + std::min(std::max(max_val, -32768), 32767))); auto const_type = tensorflow::GetTypeFromTFTensorShape({513}, rewriter.getIntegerType(16)); @@ -633,6 +815,18 @@ Value getTosaConst16bitTable(PatternRewriter& rewriter, Operation* op, return const_op.getResult(); } +template Value getTosaConst16bitTable(PatternRewriter& rewriter, + Operation* op, float input_scale, + int32_t input_zp, + float output_scale, + int32_t output_zp, + std::function func); + +template Value getTosaConst16bitTable( + PatternRewriter& rewriter, Operation* op, double input_scale, + int32_t input_zp, double output_scale, int32_t output_zp, + std::function func); + // Create a 32-bit TOSA TABLE for Softmax Exp void getTosaConst32bitSoftmaxExpTable(PatternRewriter& rewriter, Operation* op, double beta, double input_scale, @@ -759,7 +953,7 @@ Value getTosaConstTensorSingleI32(PatternRewriter& rewriter, Operation* op, Value getTosaConstTensorScalarInt(ImplicitLocOpBuilder& builder, Type type, int64_t val, int rank) { assert(rank >= 0); - assert(type.isa()); + assert(mlir::isa(type)); mlir::RankedTensorType const_type; mlir::DenseElementsAttr const_attr; auto bit_width = type.getIntOrFloatBitWidth(); @@ -958,14 +1152,14 @@ bool getTransposeConv2dPaddingValues( return false; } - int total_padding = ((ifm_size - 1) 
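// For each spatial dimension the (possibly negative) padding is
// total_padding = (ifm_size - 1) * dim_stride + filter_size - ofm_size,
// split into pad_before = total_padding / 2 and
// pad_after = total_padding - pad_before, then pushed negated below. For
// example, ifm_size = 4, dim_stride = 2, filter_size = 3, ofm_size = 8
// gives total_padding = 1, so the computed paddings are {0, -1}. The old
// clamp of total_padding to zero is dropped, so negative totals carry
// through as well.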
* dim_stride + filter_size - ofm_size); - total_padding = total_padding > 0 ? total_padding : 0; + int total_padding = + ((ifm_size - 1) * dim_stride + filter_size - ofm_size); pad_before = total_padding / 2; pad_after = total_padding - pad_before; - computed_paddings.push_back(pad_before); - computed_paddings.push_back(pad_after); + computed_paddings.push_back(-pad_before); + computed_paddings.push_back(-pad_after); } explicit_padding = rewriter.getDenseI64ArrayAttr(computed_paddings); @@ -1130,7 +1324,7 @@ LogicalResult ApplyPatternsWithShapeResolution( // We use top-down traversal so that shape inference can fully infer types // during pattern rewrite. GreedyRewriteConfig config; - config.useTopDownTraversal = true; + config.setUseTopDownTraversal(true); if (failed(applyPatternsGreedily(func, patterns, config))) { return failure(); } @@ -1145,7 +1339,7 @@ LogicalResult ApplyPatternsWithShapeResolution( if (mlir::isa(op.getType().getElementType())) { return; } - auto ety = op.getValue().getShapedType().getElementType(); + auto ety = op.getValues().getShapedType().getElementType(); auto new_ty = mlir::cast(op.getType()).clone(ety); op.getResult().setType(new_ty); }); @@ -1177,8 +1371,9 @@ void TrimQuantizedIntegerRange(UniformQuantizedType dtype, int64_t& val_min, TrimQuantizedIntegerRangeMax(dtype, val_max); } -tosa::MulOp CreateMulOpAndInfer(PatternRewriter& rewriter, Operation* op, Type result_ty, - Value input1, Value input2, int8_t shift) { +tosa::MulOp CreateMulOpAndInfer(PatternRewriter& rewriter, Operation* op, + Type result_ty, Value input1, Value input2, + int8_t shift) { if (EqualizeRanks(rewriter, op->getLoc(), input1, input2).failed()) { // uncompatible broadcast shapes, no reshape is inserted // ResultsBroadcastableShape verify will handle this @@ -1213,10 +1408,10 @@ Value reshapeScalarTo1D(PatternRewriter& rewriter, Location loc, Value value) { } DenseElementsAttr const_attr; - if (attr.getElementType().isa()) { + if (mlir::isa(attr.getElementType())) { const_attr = DenseElementsAttr::get(storage_type, {attr.getValues()[0]}); - } else if (attr.getElementType().isa()) { + } else if (mlir::isa(attr.getElementType())) { const_attr = DenseElementsAttr::get(storage_type, {attr.getValues()[0]}); } else { @@ -1289,11 +1484,7 @@ LogicalResult broadcastLowRankTensor(PatternRewriter& rewriter, Operation* op, std::optional result = convertBroadcastToOp( rewriter, op, low_rank_tensor, broadcast_shape_value); - if (!result) { - return rewriter.notifyMatchFailure(op, - "failed to broadcast low rank tensor " - "from convertBroadcastToOp"); - } + if (!result) return failure(); low_rank_tensor = result.value(); @@ -1307,5 +1498,36 @@ LogicalResult broadcastLowRankTensor(PatternRewriter& rewriter, Operation* op, return success(); } +bool checkUniqueConstantScatterIndices(ShapedType indices_type, + ShapedType result_type, + ElementsAttr const_data) { + llvm::ArrayRef const indices_shape = indices_type.getShape(); + const unsigned int indices_rank = indices_shape.size(); + const unsigned int result_rank = result_type.getRank(); + const unsigned int last_dim_size = indices_shape[indices_rank - 1]; + + // Reconstruct each index from the unshaped constant data array and + // calculate the corresponding flattened index + auto const const_data_range = const_data.getValues(); + assert((const_data_range.size() % last_dim_size == 0) && + "Constant data length should be a multiple of indices_shape[-1]"); + + std::vector flattened_indices; + flattened_indices.reserve(const_data_range.size() / 
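// Each group of indices_shape[-1] values forms one multi-dimensional index
// into the result, and getFlattenedIndex turns it into a linear (row-major)
// offset. For a result of shape [4, 5], the index {1, 2} flattens to
// 1 * 5 + 2 = 7; two index groups mapping to the same offset are then
// caught by the sort and adjacent_find check at the end of this function.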
last_dim_size); + for (auto beg = const_data_range.begin(); beg < const_data_range.end(); + beg += last_dim_size) { + std::vector current_single_index(result_rank); + std::copy(beg, beg + last_dim_size, current_single_index.begin()); + const uint64_t f_index{ + ElementsAttr::getFlattenedIndex(result_type, current_single_index)}; + flattened_indices.push_back(f_index); + } + + // If adjacent flattened values are found, there are non-unique indices + std::sort(flattened_indices.begin(), flattened_indices.end()); + return std::adjacent_find(flattened_indices.begin(), + flattened_indices.end()) == flattened_indices.end(); +} + } // namespace tosa } // namespace mlir diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h index b51719eab23f..a2b990446924 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h @@ -67,13 +67,13 @@ std::optional buildReshapeWithDynamicDims(PatternRewriter& rewriter, Value buildRescale(PatternRewriter& rewriter, Operation* op, ShapedType output_type, Value input_val, int32_t scale_multiplier, int32_t scale_shit, - int64_t input_zp, int64_t output_zp, bool double_round, + int64_t input_zp, int64_t output_zp, StringRef rounding_mode, bool scale32); // Create a TOSA rescale op from TFLite scaling, zero points and rounding mode Value buildRescale(PatternRewriter& rewriter, Operation* op, ShapedType output_type, Value input_val, double scale, - int64_t input_zp, int64_t output_zp, bool double_round, + int64_t input_zp, int64_t output_zp, StringRef rounding_mode, bool scale32); // Removes the zero point and cast to int32, no need to handle roundings modes @@ -102,14 +102,18 @@ Value buildRescaleOpConvOutput(PatternRewriter& rewriter, Operation* op, // Create a 8-bit TOSA TABLE constant tensor Value getTosaConst8bitTable(PatternRewriter& rewriter, Operation* op, - double input_scale, int32_t input_zp, - double output_scale, int32_t output_zp, - std::function func); + float input_scale, int32_t input_zp, + float output_scale, int32_t output_zp, + std::function func); // Create a 16-bit TOSA TABLE constant tensor +// A float should be used by default for FloatT except if a double is required +// for backward compatibility +template Value getTosaConst16bitTable(PatternRewriter& rewriter, Operation* op, - std::function func, double min, - double max); + FloatT input_scale, int32_t input_zp, + FloatT output_scale, int32_t output_zp, + std::function func); // Create a 32-bit TOSA TABLE for Softmax Exp void getTosaConst32bitSoftmaxExpTable(PatternRewriter& rewriter, Operation* op, @@ -122,6 +126,11 @@ Value getTosaConstRsqrt8bitTable(PatternRewriter& rewriter, Operation* op, float input_scale, int32_t input_zp, float output_scale, int32_t output_zp); +// Create an 8-bit TOSA Table constant tensor for the HardSwish operator +Value getTosaConstHardSwish8bitTable(PatternRewriter& rewriter, Operation* op, + float input_scale, int32_t input_zp, + float output_scale, int32_t output_zp); + // Create a 32-bit float constant operator from a float Value getTosaConstTensorSingleF32(PatternRewriter& rewriter, Operation* op, float val, int rank); @@ -203,6 +212,14 @@ Value getInputSlicedToItsUsedSize(PatternRewriter& rewriter, Operation* op, // Check if scale32 mode is used for given output_element_type bool isScale32(mlir::quant::UniformQuantizedType output_element_type); +// Checks if the multi-dimensional indices supplied by a constant 
tensor +// are unique. This is a useful check for legalizations to tosa.scatter +// which requires indices are unique, while in TF/TFLite they may be +// non-unique. +bool checkUniqueConstantScatterIndices(ShapedType indices_type, + ShapedType result_type, + ElementsAttr const_data); + // Applies a set of patterns greedily to the specified function, then applies // a cleanup to guarantee the function contract and constants are valid. This // means patterns can performed shape inference while not altering immutable diff --git a/tensorflow/compiler/mlir/tosa/transforms/tfl_legalize_patterns.td b/tensorflow/compiler/mlir/tosa/transforms/tfl_legalize_patterns.td index a7230ccf9013..b0141dcaf9fa 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/tfl_legalize_patterns.td +++ b/tensorflow/compiler/mlir/tosa/transforms/tfl_legalize_patterns.td @@ -29,8 +29,6 @@ include "mlir/Dialect/Tosa/IR/TosaOps.td" def ConvertTFLAbsOp : Pat<(TFL_AbsOp $arg), (Tosa_AbsOp $arg)>; def ConvertTFLCeilOp : Pat<(TFL_CeilOp $arg), (Tosa_CeilOp $arg)>; def ConvertTFLFloorOp : Pat<(TFL_FloorOp $arg), (Tosa_FloorOp $arg)>; -def ConvertTFLExpOp : Pat<(TFL_ExpOp $arg), (Tosa_ExpOp $arg)>; -def ConvertTFLLogOp : Pat<(TFL_LogOp $arg), (Tosa_LogOp $arg)>; def ConvertTFLLogicalNotOp : Pat<(TFL_LogicalNotOp $arg), (Tosa_LogicalNotOp $arg)>; // Removing the quant.stats op for unquantized models. diff --git a/tensorflow/compiler/mlir/utils/BUILD b/tensorflow/compiler/mlir/utils/BUILD index 2256c421b457..ae6a01df20e1 100644 --- a/tensorflow/compiler/mlir/utils/BUILD +++ b/tensorflow/compiler/mlir/utils/BUILD @@ -37,3 +37,40 @@ cc_library( "@llvm-project//llvm:Support", ], ) + +cc_library( + name = "saved_model_converter_utils", + srcs = ["saved_model_converter_utils.cc"], + hdrs = ["saved_model_converter_utils.h"], + visibility = [ + "//tensorflow/cc/experimental/tfa:__subpackages__", + ], + deps = [ + "//tensorflow/cc/saved_model:loader", + "//tensorflow/compiler/mlir/tensorflow:mlir_import_options", + "//tensorflow/compiler/mlir/tensorflow:translate_lib", + "//tensorflow/compiler/mlir/tf2xla/api/v2:mlir_roundtrip_flags", + "//tensorflow/core/framework:op", + "//tensorflow/core/framework:op_def_builder", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/types:span", + "@llvm-project//mlir:IR", + ], +) + +cc_library( + name = "validators", + srcs = [ + "validators.cc", + ], + hdrs = [ + "validators.h", + ], + deps = [ + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) diff --git a/tensorflow/compiler/mlir/utils/saved_model_converter_utils.cc b/tensorflow/compiler/mlir/utils/saved_model_converter_utils.cc new file mode 100644 index 000000000000..d818acf6ee52 --- /dev/null +++ b/tensorflow/compiler/mlir/utils/saved_model_converter_utils.cc @@ -0,0 +1,94 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/utils/saved_model_converter_utils.h" + +#include +#include +#include +#include +#include + +#include "absl/log/log.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/compiler/mlir/tf2xla/api/v2/mlir_roundtrip_flags.h" + + +namespace tensorflow { +namespace utils { + +// Util that registers 'extra_tf_opdefs' to the TF global registry. +// Return OK on success, failure if registering failed. +absl::Status RegisterExtraTfOpDefs( + absl::Span extra_tf_opdefs) { + for (const auto& tf_opdefs_string : extra_tf_opdefs) { + OpDef opdef; + // NOLINTNEXTLINE: Use tsl::protobuf to be compatible with OSS. + if (!tsl::protobuf::TextFormat::ParseFromString(tf_opdefs_string, &opdef)) { + LOG(ERROR) << "OpDef parsing failed for: " << tf_opdefs_string; + return absl::InvalidArgumentError("fail to parse extra OpDef"); + } + // Register extra opdefs. + // TODO: b/133770952 - Support shape functions. + OpRegistry::Global()->Register( + [opdef](OpRegistrationData* op_reg_data) -> absl::Status { + *op_reg_data = OpRegistrationData(opdef); + return absl::OkStatus(); + }); + } + return absl::OkStatus(); +} + +absl::StatusOr> ImportSavedModel( + const std::string& input_filename, const int saved_model_version, + const std::unordered_set& tags, + absl::Span extra_tf_opdefs, + absl::Span exported_names, const GraphImportConfig& specs, + bool enable_variable_lifting, mlir::MLIRContext* context, + std::unique_ptr* saved_model_bundle) { + // Register extra TF ops passed as OpDef. + auto extra_opdefs_status = RegisterExtraTfOpDefs(extra_tf_opdefs); + if (!extra_opdefs_status.ok()) return extra_opdefs_status; + + if (saved_model_version == 2) { + auto module_or = SavedModelObjectGraphToMlirImport( + input_filename, tags, exported_names, context, + /*unconditionally_use_set_output_shapes=*/true); + if (!module_or.status().ok()) return module_or.status(); + return std::move(module_or).value(); + } else if (saved_model_version == 1) { + MLIRImportOptions options; + options.upgrade_legacy = specs.upgrade_legacy; + options.unconditionally_use_set_output_shapes = true; + options.lift_variables = enable_variable_lifting; + auto module_or = SavedModelSignatureDefsToMlirImport( + input_filename, tags, exported_names, context, options, + saved_model_bundle); + + if (!module_or.status().ok()) return module_or.status(); + return std::move(module_or).value(); + } else { + return absl::InvalidArgumentError("Should be either saved model v1 or v2."); + } +} + +} // namespace utils +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/utils/saved_model_converter_utils.h b/tensorflow/compiler/mlir/utils/saved_model_converter_utils.h new file mode 100644 index 000000000000..fc4440fb918a --- /dev/null +++ b/tensorflow/compiler/mlir/utils/saved_model_converter_utils.h @@ -0,0 +1,46 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_UTILS_SAVED_MODEL_CONVERTER_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_UTILS_SAVED_MODEL_CONVERTER_UTILS_H_ + +#include +#include +#include + +#include "absl/status/statusor.h" +#include "absl/types/span.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/tf2xla/api/v2/mlir_roundtrip_flags.h" + +namespace tensorflow { +namespace utils { + +// 'saved_model_bundle' will be initialized if V1 model was loaded. +absl::StatusOr> ImportSavedModel( + const std::string& input_filename, int saved_model_version, + const std::unordered_set& tags, + absl::Span extra_tf_opdefs, + absl::Span exported_names, const GraphImportConfig& specs, + bool enable_variable_lifting, mlir::MLIRContext* context, + std::unique_ptr* saved_model_bundle); + +} // namespace utils +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_UTILS_SAVED_MODEL_CONVERTER_UTILS_H_ diff --git a/tensorflow/compiler/mlir/utils/validators.cc b/tensorflow/compiler/mlir/utils/validators.cc new file mode 100644 index 000000000000..870c7e1f1efb --- /dev/null +++ b/tensorflow/compiler/mlir/utils/validators.cc @@ -0,0 +1,147 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/utils/validators.h" + +#include +#include + +#include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Returns true if the given `op` +// * has an attribute with the given `name`, +// * and the attribute is an integer list of the form [1, X, Y, 1], +// and writes X, Y as 32-bit integer attribute to `x`, `y`. 
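// For example, an op carrying "strides" = [1, 2, 3, 1] passes the check and
// yields *x = 2 and *y = 3 as i32 attributes, while [2, 2, 3, 1] or a list
// of any other length fails.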
+bool TFIntListIs1XY1(Operation *op, StringRef name, IntegerAttr *x,
+                     IntegerAttr *y) {
+  auto attr = op->getAttrOfType<ArrayAttr>(name);
+  if (!attr) return false;
+
+  auto elements = attr.getValue();
+  if (elements.size() != 4 ||
+      std::any_of(elements.begin(), elements.end(),
+                  [](Attribute e) { return !mlir::isa<IntegerAttr>(e); }))
+    return false;
+
+  if (mlir::cast<IntegerAttr>(elements.front()).getInt() != 1 ||
+      mlir::cast<IntegerAttr>(elements.back()).getInt() != 1)
+    return false;
+
+  Builder b(op->getContext());
+  *x = b.getI32IntegerAttr(mlir::cast<IntegerAttr>(elements[1]).getInt());
+  *y = b.getI32IntegerAttr(mlir::cast<IntegerAttr>(elements[2]).getInt());
+
+  return true;
+}
+
+// Returns true if the attribute is an integer list of the form [1, X, Y, 1].
+bool TFIntListIs1XY1(const Attribute attr) {
+  const auto &elements = mlir::cast<ArrayAttr>(attr).getValue();
+  if (elements.size() != 4 ||
+      std::any_of(elements.begin(), elements.end(),
+                  [](Attribute e) { return !mlir::isa<IntegerAttr>(e); }))
+    return false;
+
+  if (mlir::cast<IntegerAttr>(elements.front()).getValue() != 1 ||
+      mlir::cast<IntegerAttr>(elements.back()).getValue() != 1)
+    return false;
+  return true;
+}
+
+// Returns true if the attribute is an integer list of the form [1, 1, X, Y].
+bool TFIntListIs11XY(const Attribute attr) {
+  const auto &elements = mlir::cast<ArrayAttr>(attr).getValue();
+  if (elements.size() != 4 ||
+      std::any_of(elements.begin(), elements.end(),
+                  [](Attribute e) { return !mlir::isa<IntegerAttr>(e); }))
+    return false;
+
+  const Attribute *data = elements.data();
+  if (mlir::cast<IntegerAttr>(data[0]).getValue() != 1 ||
+      mlir::cast<IntegerAttr>(data[1]).getValue() != 1)
+    return false;
+  return true;
+}
+
+// Returns true if the given `op`
+//   * has an attribute with the given `name`,
+//   * and the attribute is an integer list of the form [1, X, Y, Z, 1],
+// and writes X, Y, Z as 32-bit integer attributes to `x`, `y`, `z`.
+bool TFIntListIs1XYZ1(Operation *op, StringRef name, IntegerAttr *x,
+                      IntegerAttr *y, IntegerAttr *z) {
+  auto attr = op->getAttrOfType<ArrayAttr>(name);
+  if (!attr) return false;
+
+  auto elements = attr.getValue();
+  if (elements.size() != 5 ||
+      std::any_of(elements.begin(), elements.end(),
+                  [](Attribute e) { return !mlir::isa<IntegerAttr>(e); }))
+    return false;
+
+  if (mlir::cast<IntegerAttr>(elements.front()).getInt() != 1 ||
+      mlir::cast<IntegerAttr>(elements.back()).getInt() != 1)
+    return false;
+
+  Builder b(op->getContext());
+  *x = b.getI32IntegerAttr(mlir::cast<IntegerAttr>(elements[1]).getInt());
+  *y = b.getI32IntegerAttr(mlir::cast<IntegerAttr>(elements[2]).getInt());
+  *z = b.getI32IntegerAttr(mlir::cast<IntegerAttr>(elements[3]).getInt());
+
+  return true;
+}
+
+// Returns true if every element of the attribute is 1. All elements of `attr`
+// must be `IntegerAttr`.
+bool TFIntListIsAllOnes(const Attribute attr) {
+  const auto &elements = mlir::cast<ArrayAttr>(attr).getValue();
+
+  return !std::any_of(elements.begin(), elements.end(), [](Attribute e) {
+    return mlir::cast<IntegerAttr>(e).getValue() != 1;
+  });
+}
+
+bool IsBroadcastableElementsAttrs(mlir::TypedAttr a, mlir::TypedAttr b) {
+  // This would return false if we had unranked tensors (where they should
+  // probably be considered as broadcastable), but given we are working with
+  // attributes here that shouldn't be an issue.
+  return OpTrait::util::getBroadcastedType(a.getType(), b.getType()) != Type();
+}
+
+bool IsDimensionsDegenerateExceptLastOne(ArrayRef<int64_t> elements_shape) {
+  if (elements_shape.empty()) return true;
+
+  for (auto dim : elements_shape.drop_back(1)) {
+    if (dim != 1) return false;
+  }
+  return true;
+}
+
+bool IsDimensionsDegenerateExceptLastOne(TypedAttr val) {
+  if (auto ranked_type = mlir::dyn_cast<RankedTensorType>(val.getType())) {
+    return IsDimensionsDegenerateExceptLastOne(ranked_type.getShape());
+  }
+  return false;
+}
+
+}  // namespace TF
+}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/utils/validators.h b/tensorflow/compiler/mlir/utils/validators.h
new file mode 100644
index 000000000000..b55bd2199146
--- /dev/null
+++ b/tensorflow/compiler/mlir/utils/validators.h
@@ -0,0 +1,126 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This header file defines common validators used by TFLite transformation
+// passes to validate op attributes or values.
+
+#ifndef TENSORFLOW_COMPILER_MLIR_UTILS_VALIDATORS_H_
+#define TENSORFLOW_COMPILER_MLIR_UTILS_VALIDATORS_H_
+
+#include <cstdint>
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributeInterfaces.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/IR/Operation.h"  // from @llvm-project
+#include "mlir/IR/Value.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+
+namespace mlir {
+namespace TF {
+
+// TODO(jpienaar): Change these to being one of these variants and/or generate
+// these predicates.
+
+// Returns true if the given TensorFlow op does not have a `data_format`
+// attribute (then default to "NHWC"), or its `data_format` attribute is
+// "NHWC".
+inline bool TFDataFormatIsNHWC(Operation *op) {
+  auto attr = op->getAttrOfType<StringAttr>("data_format");
+  return !attr || attr.getValue() == "NHWC";
+}
+
+// Returns true if the given TensorFlow op does not have a `data_format`
+// attribute (then default to "NDHWC"), or its `data_format` attribute is
+// "NDHWC".
+inline bool TFDataFormatIsNDHWC(Operation *op) {
+  auto attr = op->getAttrOfType<StringAttr>("data_format");
+  return !attr || attr.getValue() == "NDHWC";
+}
+
+// Returns true if the given `op`
+//   * has an attribute with the given `name`,
+//   * and the attribute is an integer list of the form [1, X, Y, 1],
+// and writes X, Y as 32-bit integer attribute to `x`, `y`.
+bool TFIntListIs1XY1(Operation *op, StringRef name, IntegerAttr *x,
+                     IntegerAttr *y);
+
+// Returns true if the attribute is an integer list of the form [1, X, Y, 1].
+bool TFIntListIs1XY1(Attribute attr);
+
+// Returns true if the attribute is an integer list of the form [1, 1, X, Y].
+bool TFIntListIs11XY(Attribute attr);
+
+// Returns true if the given `op`
+//   * has an attribute with the given `name`,
+//   * and the attribute is an integer list of the form [1, X, Y, Z, 1],
+// and writes X, Y, Z as 32-bit integer attributes to `x`, `y`, `z`.
+bool TFIntListIs1XYZ1(Operation *op, StringRef name, IntegerAttr *x,
+                      IntegerAttr *y, IntegerAttr *z);
+
+// Returns true if every element of the attribute is 1. All elements of `attr`
+// must be `IntegerAttr`.
+bool TFIntListIsAllOnes(Attribute attr);
+
+// Returns true iff the given value is a float32 tensor.
+inline bool TFTypeIsFloat32Tensor(Value value) {
+  auto tensorType = mlir::dyn_cast<TensorType>(value.getType());
+  if (!tensorType) return false;
+  return tensorType.getElementType().isF32();
+}
+
+// Returns true iff the given value is a bf16 tensor.
+inline bool TFTypeIsBFloat16Tensor(Value value) {
+  auto tensorType = mlir::dyn_cast<TensorType>(value.getType());
+  if (!tensorType) return false;
+  return tensorType.getElementType().isBF16();
+}
+
+// Returns true iff the given value is a f16 tensor.
+inline bool TFTypeIsHalfTensor(Value value) {
+  auto tensorType = mlir::dyn_cast<TensorType>(value.getType());
+  if (!tensorType) return false;
+  return tensorType.getElementType().isF16();
+}
+
+// Returns true iff the given value is a f16 or bf16 tensor.
+inline bool TFTypeIsBFloat16OrHalfTensor(Value value) {
+  return TFTypeIsBFloat16Tensor(value) || TFTypeIsHalfTensor(value);
+}
+
+// Returns true iff the given TensorFlow op has a `padding` attribute whose
+// value is "SAME" or "VALID", and writes the attribute to `padding`.
+inline bool TFPaddingIsSameOrValid(Operation *op, StringAttr *padding) {
+  auto padding_attr = op->getAttrOfType<StringAttr>("padding");
+  if (padding_attr.getValue() != "SAME" && padding_attr.getValue() != "VALID")
+    return false;
+  *padding = padding_attr;
+  return true;
+}
+
+/// Returns whether the given `a` and `b` have broadcast-compatible
+/// types.
+bool IsBroadcastableElementsAttrs(mlir::TypedAttr a, mlir::TypedAttr b);
+// Returns true if every dimension of the attribute is 1 except the last one.
+bool IsDimensionsDegenerateExceptLastOne(mlir::TypedAttr val);
+// Returns true if every dimension of `elements_shape` is 1 except the last one.
+bool IsDimensionsDegenerateExceptLastOne(ArrayRef<int64_t> elements_shape);
+
+}  // end namespace TF
+}  // end namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_UTILS_VALIDATORS_H_
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index a4a1dcbea3d7..73e075340f12 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -390,7 +390,6 @@ tf_xla_py_strict_test(
         "gpu_a100",
         "gpu_h100",
     ],
-    shard_count = 2,
     tags = [
         "no_pip",  # TODO(b/149738646): fix pip install so these tests run on kokoro pip
         "optonly",  # Times out frequently in fastbuild mode.
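// ---------------------------------------------------------------------------
// Reviewer note (not part of the diff): a minimal, hypothetical usage sketch
// of the tensorflow::utils::ImportSavedModel helper added above. The model
// path, tag set, signature name, and the wrapper function name
// ImportForInspection are illustrative assumptions, not values from this
// change.
//
//   absl::Status ImportForInspection(mlir::MLIRContext& context) {
//     std::unique_ptr<tensorflow::SavedModelBundle> bundle;
//     tensorflow::GraphImportConfig specs;
//     std::vector<std::string> exported_names = {"serving_default"};
//     // Import a v1 SavedModel; `bundle` is only populated for v1 models.
//     auto module_or = tensorflow::utils::ImportSavedModel(
//         "/path/to/saved_model", /*saved_model_version=*/1,
//         /*tags=*/{"serve"}, /*extra_tf_opdefs=*/{},
//         absl::MakeSpan(exported_names), specs,
//         /*enable_variable_lifting=*/true, &context, &bundle);
//     if (!module_or.ok()) return module_or.status();
//     mlir::OwningOpRef<mlir::ModuleOp> module = std::move(module_or).value();
//     module->dump();  // Print the imported MLIR for inspection.
//     return absl::OkStatus();
//   }
// ---------------------------------------------------------------------------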
@@ -1120,7 +1119,7 @@ tf_xla_py_strict_test( size = "medium", timeout = "long", srcs = ["matrix_diag_ops_test.py"], - shard_count = 8, + shard_count = 4, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], @@ -1551,6 +1550,12 @@ tf_xla_py_strict_test( name = "stack_ops_test", size = "small", srcs = ["stack_ops_test.py"], + enabled_backends = [ + "cpu", + "gpu", + "gpu_a100", + "gpu_h100", + ], tags = [ "config-cuda-only", "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1677,8 +1682,12 @@ tf_xla_py_strict_test( name = "tensor_array_ops_test", size = "medium", srcs = ["tensor_array_ops_test.py"], - # TensorArray ops are not implemented in the on-demand compilation model yet. - disabled_backends = ["cpu_ondemand"], + enabled_backends = [ + "cpu", + "gpu", + "gpu_a100", + "gpu_h100", + ], tags = [ "config-cuda-only", "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1737,7 +1746,7 @@ tf_xla_py_strict_test( name = "ternary_ops_test", size = "medium", srcs = ["ternary_ops_test.py"], - shard_count = 8, + shard_count = 4, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], @@ -2179,7 +2188,6 @@ tf_xla_py_strict_test( name = "conv_node_name_test", size = "medium", srcs = ["conv_node_name_test.py"], - shard_count = 5, tags = [ "no_oss", # TODO(b/148108508): Re-enable this test in OSS. "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -2419,12 +2427,14 @@ tf_xla_py_strict_test( shard_count = 10, tags = [ "notap", + "optonly", ], deps = [ ":xla_test", "//tensorflow/python/client:session", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:tensor", "//tensorflow/python/ops:math_ops", "//tensorflow/python/ops:resource_variable_ops", "//tensorflow/python/ops:tpu_ops_gen", diff --git a/tensorflow/compiler/tests/conv_node_name_test.py b/tensorflow/compiler/tests/conv_node_name_test.py index 42c5c365c64b..ba1645e973a3 100644 --- a/tensorflow/compiler/tests/conv_node_name_test.py +++ b/tensorflow/compiler/tests/conv_node_name_test.py @@ -42,7 +42,7 @@ def _GetNodeNames(use_xla): input_tensor = array_ops.placeholder(np.float32, shape=input_sizes) if use_xla: - with self.test_scope(): + with self.device_scope(): # pylint: disable=protected-access graph = ops.get_default_graph() graph._set_control_flow_context( diff --git a/tensorflow/compiler/tests/sharding_util_ops_test.py b/tensorflow/compiler/tests/sharding_util_ops_test.py index 7d5ac5771f1f..ec47fddf23cc 100644 --- a/tensorflow/compiler/tests/sharding_util_ops_test.py +++ b/tensorflow/compiler/tests/sharding_util_ops_test.py @@ -23,7 +23,7 @@ from tensorflow.python.client.session import Session from tensorflow.python.framework import constant_op from tensorflow.python.framework.ops import control_dependencies -from tensorflow.python.framework.ops import Tensor +from tensorflow.python.framework.tensor import Tensor from tensorflow.python.ops import gen_tpu_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py index 809db242ac4a..101ca75f8b68 100644 --- a/tensorflow/compiler/tests/ternary_ops_test.py +++ b/tensorflow/compiler/tests/ternary_ops_test.py @@ -230,7 +230,8 @@ def testBetaincSanity(self): x = np.array([.3, .4, .0, .1], dtype=dtype) expected = 
sps.betainc(a, b, x) self._testTernary( - math_ops.betainc, a, b, x, expected, rtol=5e-6, atol=6e-6) + math_ops.betainc, a, b, x, expected, rtol=5e-5, atol=6e-5 + ) @parameterized.parameters( { diff --git a/tensorflow/compiler/tests/xla_call_module_test.py b/tensorflow/compiler/tests/xla_call_module_test.py index b8d59d77641a..197df89e2c00 100644 --- a/tensorflow/compiler/tests/xla_call_module_test.py +++ b/tensorflow/compiler/tests/xla_call_module_test.py @@ -1568,6 +1568,30 @@ def f(x): self._assertOpOutputMatchesExpected(f, (x,), (np.sin(np.cos(x)),)) + def test_op_backward_incompatibility(self): + """Test for ensuring XlaCallModuleOp with invalid bytecode.""" + x = np.array([1.0, 2.0, 3.0], dtype=np.float32) + + def f(x): + # Use an invalid MLIR string that will fail to parse when loading the + # call module op, emulating a backward incompatibility. + corrupted_module = 'stablehlo.invalid_op' + return gen_xla_ops.xla_call_module( + [x], + version=xla.call_module_maximum_supported_version(), + module=corrupted_module, + Tout=[x.dtype], + Sout=[x.shape], + platforms=[self.testing_platform()], + ) + + # Expect any error message to be included after `:` + with self.assertRaisesRegex( + errors.InvalidArgumentError, + 'Cannot deserialize computation: .+', + ): + f(x) + if __name__ == '__main__': ops.enable_eager_execution( diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h b/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h index f31af03209cc..731410a24181 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/layer_utils.h @@ -312,8 +312,8 @@ class TRTNetworkBuilder { // The tensor has "nb_dims" dimensions and each dimension has only one // element. The data type of the tensor is determined by the data type of // "scalar". - template ::value>::type* = nullptr> + template ::value>::type* = nullptr> StatusOr Constant(const T scalar, const int nb_dims) noexcept { TRT_ENSURE(nb_dims <= nvinfer1::Dims::MAX_DIMS); @@ -355,8 +355,8 @@ class TRTNetworkBuilder { } // Creates a nvinfer1::Weights object containing a single scalar. 
- template ::value>::type* = nullptr> + template ::value>::type* = nullptr> StatusOr ScalarWeights(const T scalar, const int nb_dims) noexcept { TRT_ENSURE(nb_dims <= nvinfer1::Dims::MAX_DIMS); diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 394970481f3a..f2080a0752f4 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -4,7 +4,7 @@ load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured") # load("//devtools/deps/check:deps_check.bzl", "check_dependencies") # copybara:uncomment_end -load("@local_xla//xla:xla.bzl", "xla_py_proto_library") +load("@local_xla//xla:xla.default.bzl", "xla_py_proto_library") load("@local_xla//xla/service/cpu:build_defs.bzl", "runtime_copts") load("@local_xla//xla/tsl/mkl:build_defs.bzl", "mkl_deps") load("@local_xla//xla/tsl/platform:build_config_root.bzl", "if_static") @@ -322,6 +322,7 @@ cc_library( "//tensorflow/core/platform:logging", "//tensorflow/core/platform:mutex", "@local_tsl//tsl/platform:blocking_counter", + "@local_tsl//tsl/platform:context", "@local_tsl//tsl/platform:cord", "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:ml_dtypes", @@ -346,6 +347,7 @@ cc_library( # "@local_tsl//tsl/platform:bfloat16", # "@local_tsl//tsl/platform:blocking_counter", # "@local_xla//xla/tsl/platform:byte_order", +# "@local_tsl//tsl/platform:context", # "@local_tsl//tsl/platform:cord", # "@local_tsl//tsl/platform:env_time", # "@local_tsl//tsl/platform:ml_dtypes", @@ -361,10 +363,12 @@ cc_library( # "@local_xla//xla/tsl/platform:logging", # "@local_xla//xla/tsl/platform:types", # "@local_xla//xla/tsl/platform:macros", +# "@local_xla//xla/tsl/platform/default:context", # "@local_xla//xla/tsl/platform/default:cord", # "@local_xla//xla/tsl/platform/default:env_time", # "@local_xla//xla/tsl/platform/default:logging", # "@local_xla//xla/tsl/platform/default:types", +# "@local_xla//xla/tsl/platform/google:context", # "@local_xla//xla/tsl/platform/google:cord", # "@local_xla//xla/tsl/platform/google:env_time", # "@local_xla//xla/tsl/platform/google:logging", @@ -405,7 +409,57 @@ cc_library( "@local_xla//xla:executable_run_options", "@local_xla//xla/service/cpu:buffer_desc", "//tensorflow/core/platform:types", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings:string_view", + ], +) + +cc_library( + name = "xla_compiled_cpu_function_thunks", + srcs = ["xla_compiled_cpu_function_thunks.cc"], + hdrs = ["xla_compiled_cpu_function_thunks.h"], + compatible_with = get_compatible_with_portable(), + visibility = ["//visibility:public"], + deps = [ + ":xla_compiled_cpu_function", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/types:span", + "@local_xla//xla:executable_run_options", + "@local_xla//xla/backends/cpu/codegen:aot_compiled_function_library", + "@local_xla//xla/backends/cpu/nanort:nanort_executable", + "@local_xla//xla/backends/cpu/runtime:function_library", + "@local_xla//xla/service:executable", # buildcleaner: keep (b/404179184) + "@local_xla//xla/service/cpu:cpu_aot_compilation_result", + "@local_xla//xla/service/cpu:executable_proto_cc", + "@local_xla//xla/tsl/concurrency:async_value", + "@local_xla//xla/tsl/platform:env", + "@local_xla//xla/tsl/platform:status", + ], +) + +cc_library( + name = "xla_compiled_cpu_function_factory", + srcs = ["xla_compiled_cpu_function_factory.cc"], + hdrs = ["xla_compiled_cpu_function_factory.h"], + visibility = ["//visibility:public"], + deps = [ + 
"//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + ], +) + +cc_library( + name = "xla_compiled_cpu_function_thunk_factory_registerer", + srcs = ["xla_compiled_cpu_function_thunk_factory_registerer.cc"], + visibility = ["//visibility:public"], + deps = [ + ":xla_compiled_cpu_function_factory", + "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", + "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function_thunks", ], + alwayslink = 1, ) tf_cc_test( @@ -428,20 +482,27 @@ cc_library( ":tf2xla", ":tf2xla_proto_cc", ":xla_compiled_cpu_function", + ":xla_compiled_cpu_function_thunks", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:errors", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/types:span", + "@local_tsl//tsl/platform:casts", "@local_xla//xla:cpu_function_runtime", "@local_xla//xla:shape_util", "@local_xla//xla:xla_data_proto_cc", + "@local_xla//xla/backends/cpu/codegen:compiled_function_library", "@local_xla//xla/client:client_library", "@local_xla//xla/client:executable_build_options", "@local_xla//xla/client:local_client", "@local_xla//xla/hlo/builder:xla_computation", "@local_xla//xla/service:platform_util", + "@local_xla//xla/service/cpu:cpu_aot_compilation_result", + "@local_xla//xla/service/cpu:executable_proto_cc", "@local_xla//xla/stream_executor:platform", ] + if_libtpu( if_false = [ @@ -978,7 +1039,7 @@ tf_cc_test( srcs = ["xla_jit_compiled_cpu_function_test.cc"], deps = [ ":tf2xla_proto_cc", - ":xla_compiled_cpu_function", + ":xla_compiled_cpu_function_thunks", ":xla_jit_compiled_cpu_function", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -988,6 +1049,7 @@ tf_cc_test( "@com_google_absl//absl/log:check", "@com_google_absl//absl/memory", "@com_google_absl//absl/status:statusor", + "@local_tsl//tsl/platform:casts", "@local_xla//xla:shape_util", "@local_xla//xla:status_macros", "@local_xla//xla:xla_data_proto_cc", @@ -996,6 +1058,7 @@ tf_cc_test( "@local_xla//xla/hlo/testlib:test", "@local_xla//xla/service:compiler", "@local_xla//xla/service:platform_util", + "@local_xla//xla/service/cpu:cpu_executable", "@local_xla//xla/stream_executor:platform", "@local_xla//xla/stream_executor:platform_manager", ], diff --git a/tensorflow/compiler/tf2xla/functionalize_cond_test.cc b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc index 57f1cbdf3bd4..50bd47ad73e7 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc @@ -96,7 +96,7 @@ TEST_F(FunctionalizeCondTest, JoinCondStates) { // An non-merge op with inputs from then and else branch. absl::Status status = JoinCondStatesNonMerge(then_branch, else_branch).status(); - EXPECT_TRUE(errors::IsInvalidArgument(status)); + EXPECT_TRUE(absl::IsInvalidArgument(status)); // Merge between then and else branch. 
auto joined_or = JoinCondStatesMerge(m, then_branch, else_branch); diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index 604a24514f8e..7727853a8c42 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -1114,7 +1114,7 @@ void ComplexTestFixture::RunTest() { if (restrict_to_tpu_nodes_ && mark_outer_loop_tpu_ && !mark_inner_loop_tpu_) { // This case violates the precondition of `FunctionalizeControlFlow`, we // expect an internal error. - ASSERT_EQ(errors::IsInternal(status1), true); + ASSERT_EQ(absl::IsInternal(status1), true); return; } else { // Supported cases, no error expected. diff --git a/tensorflow/compiler/tf2xla/kernels/approx_topk_op.cc b/tensorflow/compiler/tf2xla/kernels/approx_topk_op.cc index 4134356d9249..de3077d850d5 100644 --- a/tensorflow/compiler/tf2xla/kernels/approx_topk_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/approx_topk_op.cc @@ -73,7 +73,7 @@ class ApproxTopKOpBase : public XlaOpKernel { int64_t reduction_dim = reduction_dim_; if (reduction_dim < 0) { // Reverse index. - reduction_dim += op_shape.dimensions_size(); + reduction_dim += op_shape.dimensions().size(); } auto cmp_builder = ctx->builder()->CreateSubBuilder( absl::StrFormat("top_k_%s_comparator", is_max_k_ ? "gt" : "lt")); diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc index a4d9d37bd1ea..7a42150f3a9c 100644 --- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc @@ -47,7 +47,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, const xla::XlaOp input, OP_REQUIRES( ctx, - crops.shape().rank() == 2 && + crops.shape().dimensions().size() == 2 && block_rank == xla::ShapeUtil::GetDimension(crops.shape(), 0) && 2 == xla::ShapeUtil::GetDimension(crops.shape(), 1), errors::InvalidArgument("crops should have shape [", block_rank, diff --git a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc index 5e0bd1829f1c..4d8f066b8555 100644 --- a/tensorflow/compiler/tf2xla/kernels/bincount_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/bincount_op.cc @@ -48,7 +48,7 @@ class DenseBincountOp : public XlaOpKernel { ctx->builder()->GetShape(output_size_param); OP_REQUIRES_OK(ctx, output_shape_or.status()); auto output_shape_param = output_shape_or.value(); - auto output_rank = output_shape_param.rank(); + auto output_rank = output_shape_param.dimensions().size(); OP_REQUIRES(ctx, output_rank == 0, errors::InvalidArgument("Shape must be rank 0 but is rank ", output_rank)); @@ -66,7 +66,7 @@ class DenseBincountOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, input_shape_or.status()); auto input_shape = input_shape_or.value(); - auto rank = input_shape.rank(); + auto rank = input_shape.dimensions().size(); OP_REQUIRES(ctx, rank <= 2, errors::InvalidArgument( @@ -81,7 +81,7 @@ class DenseBincountOp : public XlaOpKernel { OP_REQUIRES(ctx, xla::ShapeUtil::CompatibleIgnoringElementType(weights_shape, input_shape) || - (weights_shape.dimensions_size() > 0 && + (weights_shape.dimensions().size() > 0 && weights_shape.dimensions(0) == 0), errors::InvalidArgument( "`weights` must be the same shape as `arr` or a length-0 " diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc index be9e1060939d..36ba898feab9 
100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc +++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc @@ -74,8 +74,8 @@ xla::PrecisionConfig GetPrecisionConfig() { // If `shape` is [H, W, ..., M, N] returns [H, W, ..., 1, M*N]. xla::Shape GroupedFilterShapeForDepthwiseConvolution( const xla::Shape& filter_shape) { - int64_t input_feature_dim = filter_shape.dimensions_size() - 2; - int64_t output_feature_dim = filter_shape.dimensions_size() - 1; + int64_t input_feature_dim = filter_shape.dimensions().size() - 2; + int64_t output_feature_dim = filter_shape.dimensions().size() - 1; int64_t depthwise_multiplier = filter_shape.dimensions(output_feature_dim); int64_t input_feature = filter_shape.dimensions(input_feature_dim); @@ -93,7 +93,7 @@ xla::XlaOp TransposeFilterForGroupConvolutionBackpropInput( int num_spatial_dims) { // 1. Reshape from [H, W, ..., filter_in_depth, out_depth] to [H, W, ..., // filter_in_depth, G, out_depth / G] - int num_dims = filter_shape.dimensions_size(); + int num_dims = filter_shape.dimensions().size(); CHECK_GE(num_dims, 2); // Crash OK xla::Shape new_shape = filter_shape; new_shape.set_dimensions(num_dims - 1, num_groups); @@ -256,11 +256,11 @@ absl::StatusOr MakeXlaForwardConvOp( // For 2D convolution, there should be 4 dimensions. int num_dims = attrs.num_spatial_dims + 2; - if (input_shape.dimensions_size() != num_dims) { + if (input_shape.dimensions().size() != num_dims) { return errors::InvalidArgument("input must be ", num_dims, "-dimensional", input_shape.DebugString()); } - if (filter_shape.dimensions_size() != num_dims) { + if (filter_shape.dimensions().size() != num_dims) { return errors::InvalidArgument( "filter must be ", num_dims, "-dimensional: ", filter_shape.DebugString()); diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc index 273c16f89c9d..b1da0acd6160 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc @@ -82,7 +82,8 @@ class ConvNDOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { // Need to know input rank ahead of time to determine type of convolution. OP_REQUIRES_VALUE(xla::Shape input_shape, ctx, ctx->InputXlaShape(0)); - int num_spatial_dims = input_shape.rank() - 1 - attrs_.batch_dims; + int num_spatial_dims = + input_shape.dimensions().size() - 1 - attrs_.batch_dims; OP_REQUIRES_OK(ctx, CheckValidPadding(attrs_.padding, attrs_.explicit_paddings, /*num_dims=*/num_spatial_dims + 2, @@ -105,7 +106,7 @@ class ConvNDOp : public XlaOpKernel { if (attrs_.batch_dims == 0) { // Expand dummy batch dimension. 
xla::Shape expanded_input_shape(input_shape); - for (int i = 0; i < expanded_input_shape.rank() - 1; ++i) { + for (int i = 0; i < expanded_input_shape.dimensions().size() - 1; ++i) { expanded_input_shape.set_dimensions(i + 1, input_shape.dimensions(i)); } expanded_input_shape.set_dimensions(0, 1); @@ -133,7 +134,8 @@ class ConvNDOp : public XlaOpKernel { out = xla::Reshape(out, no_batch_shape.dimensions()); } else if (attrs_.batch_dims > 1) { xla::Shape expanded_out_shape(input_shape); - for (int i = attrs_.batch_dims; i < input_shape.rank(); ++i) { + for (int i = attrs_.batch_dims; i < input_shape.dimensions().size(); + ++i) { expanded_out_shape.set_dimensions( i, out_shape.dimensions(i - (attrs_.batch_dims - 1))); } @@ -187,11 +189,12 @@ class ConvBackpropInputOp : public XlaOpKernel { xla::ValueInferenceMode::kUpperBound)); xla::Shape input_shape = TensorShapeToXLAShape(ctx->input_xla_type(1), input_tensor_shape); - OP_REQUIRES(ctx, input_shape.rank() == attrs_.num_spatial_dims + 2, - errors::InvalidArgument( - "The rank of the specified input shape must be " - "num_spatial_dims + 2. Expected ", - attrs_.num_spatial_dims + 2, " got ", input_shape.rank())); + OP_REQUIRES( + ctx, input_shape.dimensions().size() == attrs_.num_spatial_dims + 2, + errors::InvalidArgument("The rank of the specified input shape must be " + "num_spatial_dims + 2. Expected ", + attrs_.num_spatial_dims + 2, " got ", + input_shape.dimensions().size())); xla::XlaOp input_sizes = ctx->Input(0); absl::StatusOr in_backprop = MakeXlaBackpropInputConvOp( ctx->op_kernel().type_string(), input_shape, ctx->Input(1), diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc index c68e60c7884c..6c91556862d9 100644 --- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc @@ -53,7 +53,7 @@ void XlaBinaryOp::Compile(XlaOpKernelContext* ctx) { // Find out mismatched dimensions that are non-broadcastable. // Reconcile the // difference by slicing the bigger dimension. - for (int64_t i = 0; i < lhs_xla_shape.rank(); ++i) { + for (int64_t i = 0; i < lhs_xla_shape.dimensions().size(); ++i) { if (lhs_xla_shape.is_dynamic_dimension(i)) { if (!rhs_xla_shape.is_dynamic_dimension(i) && lhs_xla_shape.dimensions(i) > rhs_xla_shape.dimensions(i) && @@ -116,7 +116,8 @@ void XlaBinaryOp::Compile(XlaOpKernelContext* ctx) { std::vector dimensions(lhs_xla_shape.dimensions().begin(), lhs_xla_shape.dimensions().end()); dimensions[i] = rhs_xla_shape.dimensions(i); - std::vector broadcast_dimensions(lhs_xla_shape.rank()); + std::vector broadcast_dimensions( + lhs_xla_shape.dimensions().size()); absl::c_iota(broadcast_dimensions, 0); lhs = xla::BroadcastInDim(lhs, dimensions, broadcast_dimensions); diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc index 6e577f412fb3..ceeea010ee78 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc @@ -132,10 +132,10 @@ class DynamicPartitionOp : public XlaOpKernel { // // 3. We reshape the result of DynamicPartition1D back from 1D to output // shape. - if (data_shape.rank() > partition_shape.rank()) { + if (data_shape.dimensions().size() > partition_shape.dimensions().size()) { // Broadcast parititon_shape so that it can be the same as data_shape. 
std::vector broadcasted_dims; - auto rank = partition_shape.rank(); + auto rank = partition_shape.dimensions().size(); broadcasted_dims.reserve(rank); for (int64_t i = 0; i < rank; ++i) { broadcasted_dims.push_back(i); @@ -152,7 +152,8 @@ class DynamicPartitionOp : public XlaOpKernel { output_shape_bound_dims.push_back( xla::ShapeUtil::ElementsIn(partition_shape)); int64_t count_diff = 1; - for (int64_t i = partition_shape.rank(); i < data_shape.rank(); ++i) { + for (int64_t i = partition_shape.dimensions().size(); + i < data_shape.dimensions().size(); ++i) { output_shape_bound_dims.push_back(data_shape.dimensions(i)); count_diff *= data_shape.dimensions(i); } diff --git a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc index 96d3c9bf08cc..2a65441eb79b 100644 --- a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc @@ -270,7 +270,7 @@ class FakeQuantWithMinMaxVarsPerChannelOp : public XlaOpKernel { absl::Span input_dimensions = input_shape.dimensions(); auto convert_to_input_shape = [&](const xla::XlaOp op) { return xla::BroadcastInDim(op, input_dimensions, - {input_shape.rank() - 1}); + {input_shape.dimensions_size() - 1}); }; input_min = convert_to_input_shape(input_min); input_max = convert_to_input_shape(input_max); @@ -325,13 +325,13 @@ class FakeQuantWithMinMaxVarsPerChannelGradOp : public XlaOpKernel { absl::Span input_dimensions = input_shape.dimensions(); std::vector reduce_axes; - for (int64_t i = 0; i + 1 < input_shape.rank(); ++i) { + for (int64_t i = 0; i + 1 < input_shape.dimensions_size(); ++i) { reduce_axes.push_back(i); } auto convert_to_input_shape = [&](const xla::XlaOp op) { return xla::BroadcastInDim(op, input_dimensions, - {input_shape.rank() - 1}); + {input_shape.dimensions_size() - 1}); }; input_min = convert_to_input_shape(input_min); input_max = convert_to_input_shape(input_max); diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc index 2108db386a79..2783951e1b6b 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc @@ -19,7 +19,6 @@ limitations under the License. #include "absl/log/check.h" #include "absl/status/status.h" -#include "absl/types/optional.h" #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h" #include "tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/shape_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc index f20c2384b533..dcada42c0966 100644 --- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc @@ -49,7 +49,8 @@ class MirrorPadOp : public XlaOpKernel { // - [1, 2, 3, 3, 2] in symmetric mode. int64_t excluded_edges = mode == MirrorPadMode::REFLECT ? 1 : 0; xla::XlaOp accum = t; - for (int64_t dimno = original_shape.rank() - 1; dimno >= 0; --dimno) { + for (int64_t dimno = original_shape.dimensions().size() - 1; dimno >= 0; + --dimno) { auto t_rev = xla::Rev(accum, {dimno}); int64_t lhs_padding = pad_literal.Get({dimno, 0}); int64_t rhs_padding = pad_literal.Get({dimno, 1}); @@ -136,7 +137,8 @@ class MirrorPadGradOp : public XlaOpKernel { // - [1, 2, 3, 3, 2] in symmetric mode. int64_t excluded_edges = mode == MirrorPadMode::REFLECT ? 
1 : 0; xla::XlaOp grad = t; - for (int64_t dimno = original_shape.rank() - 1; dimno >= 0; --dimno) { + for (int64_t dimno = original_shape.dimensions().size() - 1; dimno >= 0; + --dimno) { int64_t lhs_padding = pad_literal.Get({dimno, 0}); int64_t rhs_padding = pad_literal.Get({dimno, 1}); int64_t dim_size = original_shape.dimensions(dimno); diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index 98d75dfc2f89..aa7c78b8b8f9 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -240,7 +240,7 @@ class MaxPoolOp : public PoolingOp { OP_REQUIRES_OK(ctx, input_shape.status()); } - OP_REQUIRES(ctx, input_shape->dimensions_size() == num_dims(), + OP_REQUIRES(ctx, input_shape->dimensions().size() == num_dims(), errors::InvalidArgument("Input to ", type_string(), " operator must have ", num_dims(), " dimensions")); @@ -248,7 +248,7 @@ class MaxPoolOp : public PoolingOp { input, ksize, stride, padding_, XlaTensorFormat( data_format_ == FORMAT_NCHW_VECT_C ? FORMAT_NCHW : data_format_, - input_shape->dimensions_size() - 2)); + input_shape->dimensions().size() - 2)); if (data_format_ == FORMAT_NCHW_VECT_C) { absl::StatusOr result_shape = diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc index 34fe5e8651f0..ae225152fa4d 100644 --- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc @@ -225,7 +225,7 @@ class ParameterizedTruncatedNormalOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); xla::Shape xla_shape; OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype, shape, &xla_shape)); - OP_REQUIRES(ctx, xla_shape.rank() >= 1, + OP_REQUIRES(ctx, xla_shape.dimensions().size() >= 1, errors::InvalidArgument( "shape parameter must have rank >= 1, received (", xla::ShapeUtil::HumanString(xla_shape), ")")); diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops_util.cc b/tensorflow/compiler/tf2xla/kernels/random_ops_util.cc index 59a7e92a28df..4ba4961ad5b8 100644 --- a/tensorflow/compiler/tf2xla/kernels/random_ops_util.cc +++ b/tensorflow/compiler/tf2xla/kernels/random_ops_util.cc @@ -124,7 +124,7 @@ xla::XlaOp GetU64FromS32Seeds(xla::XlaOp seed0, xla::XlaOp seed1) { absl::StatusOr GetAlgId(XlaOpKernelContext* ctx, int alg_input_idx) { TF_ASSIGN_OR_RETURN(auto alg_shape, ctx->InputXlaShape(alg_input_idx)); - if (alg_shape.rank() != 0) { + if (alg_shape.dimensions().size() != 0) { return absl::InvalidArgumentError( absl::StrCat("The algorithm argument must be of shape [], not ", alg_shape.DebugString())); diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc index eb78eba56c11..ba17d1b295b7 100644 --- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc @@ -99,7 +99,8 @@ class ReshapeOp : public XlaOpKernel { int64_t missing = input_num_elements / product; if (!input_has_zero_dim) { - if (input_xla_shape->is_static() || input_xla_shape->rank() != 1) { + if (input_xla_shape->is_static() || + input_xla_shape->dimensions().size() != 1) { OP_REQUIRES( ctx, product * missing == input_num_elements, errors::InvalidArgument( diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc index b721011f5126..f6ff3345d687 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc +++ 
b/tensorflow/compiler/tf2xla/kernels/shape_op.cc @@ -326,7 +326,7 @@ class SqueezeOp : public XlaOpKernel { ctx->builder()->GetShape(ctx->Input(0)); OP_REQUIRES_OK(ctx, input_shape.status()); xla::Shape shape = input_shape.value(); - int64_t rank = shape.rank(); + int64_t rank = shape.dimensions().size(); absl::flat_hash_set wrapped_squeeze_dims; wrapped_squeeze_dims.reserve(squeeze_dims_.size()); @@ -402,14 +402,14 @@ class ZerosLikeOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, list_shape_or.status()); const xla::Shape& list_shape = list_shape_or.value(); std::vector> list_dynamic_dims; - list_dynamic_dims.reserve(list_shape.tuple_shapes_size() - 1); - for (int i = 0; i < list_shape.tuple_shapes_size() - 1; ++i) { + list_dynamic_dims.reserve(list_shape.tuple_shapes().size() - 1); + for (int i = 0; i < list_shape.tuple_shapes().size() - 1; ++i) { // Set dynamic dimension size to 0 for initialization value. std::vector dynamic_dims; const xla::Shape& shape = list_shape.tuple_shapes(i); auto sub_element = xla::GetTupleElement(list, i); - dynamic_dims.reserve(shape.dimensions_size()); - for (int64_t dim = 0; dim < shape.dimensions_size(); ++dim) { + dynamic_dims.reserve(shape.dimensions().size()); + for (int64_t dim = 0; dim < shape.dimensions().size(); ++dim) { dynamic_dims.push_back(xla::GetDimensionSize(sub_element, dim)); } list_dynamic_dims.push_back(dynamic_dims); @@ -433,7 +433,7 @@ class ZerosLikeOp : public XlaOpKernel { auto result = xla::Broadcast(zero, input_shape.dimensions()); // Setting up dynamic dimensions of the broadcast. - for (int64_t i = 0; i < input_shape.dimensions_size(); ++i) { + for (int64_t i = 0; i < input_shape.dimensions().size(); ++i) { if (input_shape.is_dynamic_dimension(i)) { xla::XlaOp input_dynamic_dim = xla::GetDimensionSize(input, i); result = xla::SetDimensionSize(result, input_dynamic_dim, i); diff --git a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc index d3804afd0f00..d4a93e055614 100644 --- a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc @@ -55,7 +55,7 @@ void SpaceToBatch(XlaOpKernelContext* ctx, const xla::XlaOp input, OP_REQUIRES( ctx, - paddings.shape().rank() == 2 && + paddings.shape().dimensions().size() == 2 && block_rank == xla::ShapeUtil::GetDimension(paddings.shape(), 0) && 2 == xla::ShapeUtil::GetDimension(paddings.shape(), 1), errors::InvalidArgument("paddings should have shape [", block_rank, diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 64106e1ec910..e15196bd7564 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -470,7 +470,7 @@ class StridedSliceGradOp : public XlaOpKernel { need_padding = true; } } - for (int64_t i = 0; i < grad_shape.rank(); ++i) { + for (int64_t i = 0; i < grad_shape.dimensions().size(); ++i) { // Use grad shape, which is known, to update unknown processing shape. // Grad shape is the output of the ValidateStridedSliceOp function in // forward pass, thus we use output_to_processing_mapping. @@ -613,7 +613,7 @@ class StridedSliceGradOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->ResolveInputDynamismIntoPredVector(0, &dynamic_input)); // Input of strided_slice_op has to have the same shape as output. 
- DCHECK_EQ(grad_shape.rank(), input_shape.dims()); + DCHECK_EQ(grad_shape.dimensions().size(), input_shape.dims()); for (int64_t dim = 0; dim < input_shape.dims(); ++dim) { DCHECK_EQ(grad_shape.dimensions(dim), input_shape.dim_size(dim)); if (dynamic_input[dim]) { diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc index 74ac971ae3f3..a1f58d5ae9b4 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc @@ -65,14 +65,14 @@ absl::StatusOr>> GetTensorListDynamicDims( std::vector> list_dynamic_dims; // Set dynamic dimension size to 0 for initialization value. std::vector dynamic_dims; - dynamic_dims.reserve(1 + element_shape.dimensions_size()); + dynamic_dims.reserve(1 + element_shape.dimensions().size()); if (leading_dim_is_dynamic) { dynamic_dims.push_back(ctx->Input(1)); } else { dynamic_dims.push_back( xla::ConstantR0(ctx->builder(), num_elements)); } - for (int64_t dim = 0; dim < element_shape.dimensions_size(); ++dim) { + for (int64_t dim = 0; dim < element_shape.dimensions().size(); ++dim) { if (dims_are_dynamic[dim]) { auto dynamic_dim_size = xla::Slice(ctx->Input(0), {dim}, {dim + 1}, {1}); dynamic_dim_size = xla::Reshape(dynamic_dim_size, {}); diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc index 50c4cdb19c43..0cb01190dbd1 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc @@ -290,8 +290,8 @@ absl::Status CreateZerosTensorListWithShape( xla::XlaOp zero = xla::ConstantLiteral(b, xla::LiteralUtil::Zero(shape.element_type())); xla::XlaOp zeros = xla::Broadcast(zero, shape.dimensions()); - TF_RET_CHECK(dynamic_dims[i].size() == shape.dimensions_size()); - for (int64_t dim = 0; dim < shape.dimensions_size(); ++dim) { + TF_RET_CHECK(dynamic_dims[i].size() == shape.dimensions().size()); + for (int64_t dim = 0; dim < shape.dimensions().size(); ++dim) { if (shape.is_dynamic_dimension(dim)) { zeros = xla::SetDimensionSize(zeros, dynamic_dims[i][dim], dim); } @@ -343,7 +343,7 @@ absl::Status GetInitializedTensorListForElement(xla::XlaOp list, // Prepare dynamic dimension dimensions for zero tensor list. The dynamic // sizes are created by reading the dynamic dimension size of sub-elements. 
std::vector> list_dynamic_dims; - for (int i = 0; i < list_shape.tuple_shapes_size() - 1; ++i) { + for (int i = 0; i < list_shape.tuple_shapes().size() - 1; ++i) { std::vector dynamic_dims; const xla::Shape& shape = list_shape.tuple_shapes(i); dynamic_dims.push_back(leading_dim_dynamic_size); @@ -353,7 +353,7 @@ absl::Status GetInitializedTensorListForElement(xla::XlaOp list, } else { sub_element = element; } - for (int64_t dim = 0; dim < shape.dimensions_size() - 1; ++dim) { + for (int64_t dim = 0; dim < shape.dimensions().size() - 1; ++dim) { dynamic_dims.push_back(xla::GetDimensionSize(sub_element, dim)); } list_dynamic_dims.push_back(dynamic_dims); @@ -392,7 +392,7 @@ absl::Status ExecuteTensorListPushBack(xla::XlaOp list, xla::XlaOp element, element_part = xla::Reshape(element_part, element_part_dims); std::vector start_indices( - element_part_shape.dimensions_size() + 1, + element_part_shape.dimensions().size() + 1, xla::ConstantR0(b, 0)); start_indices[0] = push_index; @@ -408,7 +408,7 @@ absl::Status ExecuteTensorListPushBack(xla::XlaOp list, xla::XlaOp element, element_dims.insert(element_dims.begin(), 1); xla::XlaOp update = xla::Reshape(element, element_dims); - std::vector start_indices(element_shape.dimensions_size() + 1, + std::vector start_indices(element_shape.dimensions().size() + 1, xla::ConstantR0(b, 0)); start_indices[0] = push_index; @@ -447,7 +447,7 @@ absl::Status ExecuteTensorListPopBack(xla::XlaOp list, xla::XlaOp* list_result, for (int i = 0; i < list_tuple_size - 1; i++) { const xla::Shape& list_part_shape = xla::ShapeUtil::GetTupleElementShape(list_shape, i); - std::vector start_indices(list_part_shape.dimensions_size(), + std::vector start_indices(list_part_shape.dimensions().size(), xla::ConstantR0(b, 0)); start_indices[0] = push_index; @@ -495,7 +495,7 @@ absl::Status ExecuteTensorListSetItem(xla::XlaOp list, xla::XlaOp index, element_dims.insert(element_dims.begin(), 1); xla::XlaOp update = xla::Reshape(element, element_dims); - std::vector start_indices(element_shape.dimensions_size() + 1, + std::vector start_indices(element_shape.dimensions().size() + 1, xla::ConstantR0(b, 0)); start_indices[0] = index; @@ -504,7 +504,7 @@ absl::Status ExecuteTensorListSetItem(xla::XlaOp list, xla::XlaOp index, TF_ASSIGN_OR_RETURN(const xla::Shape* list_part_shape, b->GetShapePtr(list_part)); TF_ASSIGN_OR_RETURN(const xla::Shape* update_shape, b->GetShapePtr(update)); - for (int i = 0; i < list_part_shape->dimensions_size(); ++i) { + for (int i = 0; i < list_part_shape->dimensions().size(); ++i) { auto list_part_dim_size = list_part_shape->dimensions(i); auto update_dim_size = update_shape->dimensions(i); // If the update is larger than the list part, the DynamicUpdateSlice will @@ -549,7 +549,7 @@ absl::Status ExecuteTensorListGetItem(xla::XlaOp list, xla::XlaOp index, TF_ASSIGN_OR_RETURN(xla::Shape list_shape, b->GetShape(list)); const xla::Shape& buffer_shape = xla::ShapeUtil::GetTupleElementShape(list_shape, 0); - std::vector start_indices(buffer_shape.dimensions_size(), + std::vector start_indices(buffer_shape.dimensions().size(), xla::ConstantR0(b, 0)); start_indices[0] = index; @@ -561,7 +561,7 @@ absl::Status ExecuteTensorListGetItem(xla::XlaOp list, xla::XlaOp index, xla::XlaOp read = xla::DynamicSlice(list_part, start_indices, slice_shape); // Propagate dynamic dimensions from buffer to the sliced buffer, except for // leading dimension (which is always static 1). 
- for (int64_t i = 1; i < buffer_shape.dimensions_size(); ++i) { + for (int64_t i = 1; i < buffer_shape.dimensions().size(); ++i) { if (buffer_shape.is_dynamic_dimension(i)) { auto buffer = xla::GetTupleElement(list, 0); auto gds = xla::GetDimensionSize(buffer, i); diff --git a/tensorflow/compiler/tf2xla/kernels/to_bool_op.cc b/tensorflow/compiler/tf2xla/kernels/to_bool_op.cc index 763159f2140f..9db0334ff438 100644 --- a/tensorflow/compiler/tf2xla/kernels/to_bool_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/to_bool_op.cc @@ -43,7 +43,7 @@ class ToBoolOp : public XlaOpKernel { // If the input is a scalar, then non-zero value returns True. TF_ASSIGN_OR_RETURN(auto shape, ctx->InputXlaShape(0)); - if (shape.rank() == 0) { + if (shape.dimensions().empty()) { auto result = xla::Ne(ctx->Input(0), xla::ZerosLike(input)); ctx->SetOutput(0, result); return absl::OkStatus(); @@ -52,7 +52,7 @@ class ToBoolOp : public XlaOpKernel { // Otherwise, any input tensor with elements returns True. Input tensor // dimensions might be dynamic with bounds so multiply all the dimensions. xla::XlaOp num_elements = xla::One(ctx->builder(), xla::S32); - for (int64_t dim = 0; dim < shape.rank(); dim++) { + for (int64_t dim = 0; dim < shape.dimensions().size(); dim++) { num_elements = xla::Mul(num_elements, xla::GetDimensionSize(input, dim)); } auto result = xla::Ne(num_elements, xla::ZerosLike(num_elements)); diff --git a/tensorflow/compiler/tf2xla/kernels/topk_op.cc b/tensorflow/compiler/tf2xla/kernels/topk_op.cc index 422bef6ba3fb..2643e11e89e5 100644 --- a/tensorflow/compiler/tf2xla/kernels/topk_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/topk_op.cc @@ -45,7 +45,7 @@ class TopKOp : public XlaOpKernel { const absl::StatusOr input_shape_or = context->InputXlaShape(0); OP_REQUIRES_OK(context, input_shape_or.status()); const xla::Shape& input_shape = *input_shape_or; - int last_dim = input_shape.dimensions_size() - 1; + int last_dim = input_shape.dimensions().size() - 1; int last_dim_size = input_shape.dimensions(last_dim); int64_t k; @@ -62,7 +62,7 @@ class TopKOp : public XlaOpKernel { OP_REQUIRES(context, k >= 0, errors::InvalidArgument("Need k >= 0, got ", k)); - OP_REQUIRES(context, input_shape.dimensions_size() >= 1, + OP_REQUIRES(context, input_shape.dimensions().size() >= 1, errors::InvalidArgument("input must be >= 1-D, got shape ", input_shape.DebugString())); diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc index b9b7f606d970..dbd6cda9d950 100644 --- a/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc +++ b/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc @@ -46,7 +46,7 @@ void PopulateXlaOpGeneratorMap(XlaOpGeneratorMap* op_generator_map) { #define ADD_XLA_OP_GENERATOR(Name) \ add_xla_op_generator(#Name, \ - static_cast(xla::Name)); + [](xla::XlaOp operand) { return xla::Name(operand); }); ADD_XLA_OP_GENERATOR(Abs); ADD_XLA_OP_GENERATOR(Acos); @@ -70,7 +70,8 @@ void PopulateXlaOpGeneratorMap(XlaOpGeneratorMap* op_generator_map) { add_xla_op_generator("Rint", xla::RoundToEven); ADD_XLA_OP_GENERATOR(Round); ADD_XLA_OP_GENERATOR(Rsqrt); - add_xla_op_generator("Sigmoid", xla::Logistic); + add_xla_op_generator("Sigmoid", + [](xla::XlaOp x) { return xla::Logistic(x); }); ADD_XLA_OP_GENERATOR(Sin); ADD_XLA_OP_GENERATOR(Sinh); ADD_XLA_OP_GENERATOR(Sqrt); diff --git a/tensorflow/compiler/tf2xla/kernels/unique_op.cc b/tensorflow/compiler/tf2xla/kernels/unique_op.cc index 9730427dff3b..46de3dd89b61 100644 
--- a/tensorflow/compiler/tf2xla/kernels/unique_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/unique_op.cc @@ -22,7 +22,6 @@ limitations under the License. #include #include "absl/status/statusor.h" -#include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" @@ -57,8 +56,8 @@ class UniqueOpBase : public XlaOpKernel { xla::XlaOp MoveAxis(xla::XlaOp a, int64_t from, int64_t to, const xla::Shape& input_shape) { std::vector permutation; - permutation.reserve(input_shape.rank()); - for (int64_t i = 0; i < input_shape.rank(); ++i) { + permutation.reserve(input_shape.dimensions().size()); + for (int64_t i = 0; i < input_shape.dimensions().size(); ++i) { permutation.push_back(i); } std::swap(permutation[from], permutation[to]); @@ -147,15 +146,15 @@ class UniqueOpBase : public XlaOpKernel { absl::StatusOr input_shape_or = ctx->builder()->GetShape(input); OP_REQUIRES_OK(ctx, input_shape_or.status()); auto input_shape = input_shape_or.value(); - axis = axis < 0 ? axis + input_shape.rank() : axis; - OP_REQUIRES(ctx, 0 <= axis && axis < input_shape.rank(), + axis = axis < 0 ? axis + input_shape.dimensions().size() : axis; + OP_REQUIRES(ctx, 0 <= axis && axis < input_shape.dimensions().size(), errors::InvalidArgument("axis has to be between [0, ", - input_shape.rank(), ")")); + input_shape.dimensions().size(), ")")); auto aux = MoveAxis(input, axis, 0, input_shape); auto aux_shape = ctx->builder()->GetShape(aux).value(); int64_t leading_size = aux_shape.dimensions(0); int64_t product = 1; - for (int64_t i = 1; i < aux_shape.rank(); ++i) { + for (int64_t i = 1; i < aux_shape.dimensions().size(); ++i) { product *= aux_shape.dimensions(i); } aux = xla::Reshape(aux, {leading_size, product}); diff --git a/tensorflow/compiler/tf2xla/kernels/where_op.cc b/tensorflow/compiler/tf2xla/kernels/where_op.cc index f9dc5a0a456e..f97e6d5077ef 100644 --- a/tensorflow/compiler/tf2xla/kernels/where_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/where_op.cc @@ -54,7 +54,7 @@ absl::StatusOr ShiftElemsRight(XlaOp x) { xla::XlaBuilder* b = x.builder(); absl::StatusOr shape = b->GetShape(x); TF_RETURN_IF_ERROR(shape.status()); - TF_RET_CHECK(shape->dimensions_size() == 1); + TF_RET_CHECK(shape->dimensions().size() == 1); int64_t n = shape->dimensions(0); XlaOp padded = xla::PadInDim(x, xla::Zero(b, shape->element_type()), @@ -94,7 +94,7 @@ absl::StatusOr PrefixSum(XlaOp arr) { absl::StatusOr input_shape = b->GetShape(arr); TF_RETURN_IF_ERROR(input_shape.status()); - TF_RET_CHECK(input_shape->dimensions_size() == 1); + TF_RET_CHECK(input_shape->dimensions().size() == 1); int64_t n = input_shape->dimensions(0); // The original input length must be a power of 2, but we recursively divide @@ -173,7 +173,7 @@ absl::StatusOr CompileWhereWithSort(XlaOpKernelContext* ctx) { std::vector types_to_sort = {xla::PRED}; // Generate iota for each dimension, which after combining becomes // indices of each element. 
- for (int64_t axis = 0; axis < iota_shape.rank(); ++axis) { + for (int64_t axis = 0; axis < iota_shape.dimensions_size(); ++axis) { XlaOp iota = xla::Iota(ctx->builder(), iota_shape, axis); XlaOp reshaped = xla::Reshape(iota, {flattened_size}); to_sort.push_back(reshaped); @@ -184,7 +184,7 @@ absl::StatusOr CompileWhereWithSort(XlaOpKernelContext* ctx) { to_sort, xla::CreateScalarGtComputation(types_to_sort, ctx->builder()), /*dimension=*/0, /*is_stable=*/true); std::vector to_concat; - for (int64_t i = 0; i < iota_shape.rank(); ++i) { + for (int64_t i = 0; i < iota_shape.dimensions_size(); ++i) { XlaOp index_single_dim = xla::GetTupleElement(sorted, i + 1); to_concat.push_back(xla::Reshape(index_single_dim, {flattened_size, 1})); } @@ -277,8 +277,8 @@ absl::StatusOr CompileWhereWithPrefixSum(XlaOpKernelContext* ctx) { // and then scatter iotas[out_idxs] into the output. std::vector iotas_to_concat; auto iota_shape = xla::ShapeUtil::MakeShape(S32, input_shape.dimensions()); - iotas_to_concat.reserve(iota_shape.rank()); - for (int64_t axis = 0; axis < iota_shape.rank(); ++axis) { + iotas_to_concat.reserve(iota_shape.dimensions_size()); + for (int64_t axis = 0; axis < iota_shape.dimensions_size(); ++axis) { iotas_to_concat.push_back( xla::Reshape(xla::Iota(b, iota_shape, axis), {flattened_size, 1})); } @@ -303,8 +303,9 @@ absl::StatusOr CompileWhereWithPrefixSum(XlaOpKernelContext* ctx) { scatter_dnums.add_scatter_dims_to_operand_dims(0); scatter_dnums.add_update_window_dims(1); XlaOp scattered = xla::Scatter( - /*input=*/xla::Zeros(b, /*shape=*/xla::ShapeUtil::MakeShape( - S32, {flattened_size, iota_shape.rank()})), + /*input=*/xla::Zeros( + b, /*shape=*/xla::ShapeUtil::MakeShape( + S32, {flattened_size, iota_shape.dimensions_size()})), /*scatter_indices=*/out_idxs, /*updates=*/iotas, /*update_computation=*/assn_computation, scatter_dnums, /*indices_are_sorted=*/true, /*unique_indices=*/true); diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index 8f021521eded..415f465f0b50 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -264,7 +264,7 @@ absl::StatusOr BuildWrappedBody( if (output_subshape.IsArray()) { const xla::Shape& input_subshape = xla::ShapeUtil::GetSubshape(input_shape, index); - for (int d = 0; d < output_subshape.rank(); ++d) { + for (int d = 0; d < output_subshape.dimensions().size(); ++d) { if (input_subshape.is_dynamic_dimension(d) && !output_subshape.is_dynamic_dimension(d)) { *element = xla::SetDimensionSize( @@ -576,7 +576,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { if (input_shape != list_shape) { // Prepare dynamic dimensions for element shapes. std::vector> list_dynamic_dims; - for (int i = 0; i < list_shape.tuple_shapes_size() - 1; ++i) { + for (int i = 0; i < list_shape.tuple_shapes().size() - 1; ++i) { std::vector dynamic_dims; const xla::Shape& shape = list_shape.tuple_shapes(i); @@ -596,7 +596,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { // Set dynamic dimension size to 0 for element value. Inside the while // loop, TensorlistSetItem will properly set the element shape's // dynamic dimension. 
- for (int64_t dim = 1; dim < shape.dimensions_size(); ++dim) { + for (int64_t dim = 1; dim < shape.dimensions().size(); ++dim) { int32_t dim_size = shape.dimensions(dim); if (shape.is_dynamic_dimension(dim)) { dim_size = 0; diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc index ddd1f23cbb06..fc8e8bdf62e2 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc @@ -187,7 +187,7 @@ absl::Status XlaCallModuleLoader::SetPlatformIndex( platform_index_arg.getLoc(), const_attr); platform_index_arg.replaceAllUsesWith(platform_index_op); - main_.eraseArgument(0); + CHECK(llvm::succeeded(main_.eraseArgument(0))); platform_index_arg_set_ = true; return absl::OkStatus(); } @@ -267,8 +267,11 @@ absl::Status XlaCallModuleLoader::RefineDynamicShapes( // Get static MLIR Type from xla Shape. const xla::Shape &xla_shape = input_shapes[next_actual_input++]; - std::vector xla_dimensions(xla_shape.dimensions().begin(), - xla_shape.dimensions().end()); + std::vector xla_dimensions; + if (xla_shape.IsArray()) { + xla_dimensions = std::vector(xla_shape.dimensions().begin(), + xla_shape.dimensions().end()); + } TF_ASSIGN_OR_RETURN( mlir::Type element_type, ConvertPrimitiveTypeToMlirType(xla_shape.element_type(), builder)); @@ -399,9 +402,15 @@ absl::Status XlaCallModuleLoader::LoadModule( } // Parse the StableHLO/VHLO bytecode - module_ = mlir::stablehlo::deserializePortableArtifact(module_str, context_); - if (!module_) { - return absl::InvalidArgumentError("Cannot deserialize computation"); + { + mlir::StatusScopedDiagnosticHandler diag_handler(context_); + module_ = + mlir::stablehlo::deserializePortableArtifact(module_str, context_); + if (!module_) { + return absl::InvalidArgumentError( + absl::StrCat("Cannot deserialize computation: ", + diag_handler.ConsumeStatus().ToString())); + } } VLOG(3) << "Parsed serialized module (version = " << version << ", platforms = [" << absl::StrJoin(platforms, ", ") @@ -481,18 +490,14 @@ absl::Status XlaCallModuleLoader::ValidateStaticShapes() { absl::Status XlaCallModuleLoader::PrepareStablehloForLowering() { mlir::StatusScopedDiagnosticHandler diag_handler(module_->getContext()); - // TODO (b/393390051): Migrate required passes to StableHLO. + // TODO (b/410057228): Replace MHLO canonicalization with StableHLO. + // This code requires MHLO CaseOp canonicalization to remove unreachable + // branches, else `tf.call_tf_function` inlining can fail. mlir::PassManager pm(module_->getContext()); - applyTensorflowAndCLOptions(pm); pm.addPass(mlir::mhlo::createStablehloLegalizeToHloPass()); - pm.addNestedPass( - mlir::mhlo::createChloLegalizeToHloPass()); pm.addNestedPass(mlir::createCanonicalizerPass()); - // In order to export to XLA, we must sink constants to control flow - // regions, since XLA uses functional control flow. 
- pm.addNestedPass( - mlir::mhlo::createSinkConstantsToControlFlowPass()); pm.addPass(mlir::mhlo::createHloLegalizeToStablehloPass()); + if (failed(pm.run(*module_))) { return absl::InternalError( absl::StrCat("MHLO->HLO lowering passes failed: ", @@ -500,7 +505,7 @@ absl::Status XlaCallModuleLoader::PrepareStablehloForLowering() { } if (VLOG_IS_ON(5)) { - DumpMlirOpToFile("xla_call_module.after_mhlo_lowering", *module_); + DumpMlirOpToFile("xla_call_module.after_canonicalization", *module_); } return absl::OkStatus(); diff --git a/tensorflow/compiler/tf2xla/layout_util.cc b/tensorflow/compiler/tf2xla/layout_util.cc index 5fda54d2903d..b000c49f1f96 100644 --- a/tensorflow/compiler/tf2xla/layout_util.cc +++ b/tensorflow/compiler/tf2xla/layout_util.cc @@ -72,8 +72,8 @@ absl::Status RewriteLayoutWithShardedShape( sharding->TileOffsetForDevice(*xla_shape, device); std::vector limit = sharding->TileLimitForDevice(*xla_shape, device); - std::vector dimensions(xla_shape->rank()); - for (int64_t i = 0; i < xla_shape->rank(); ++i) { + std::vector dimensions(xla_shape->dimensions().size()); + for (int64_t i = 0; i < xla_shape->dimensions().size(); ++i) { dimensions[i] = limit[i] - offset[i]; } xla::Shape per_device_xla_shape = @@ -102,7 +102,7 @@ absl::StatusOr ReshapeWithCorrectRepresentationAndSharding( std::optional sharding, bool fast_mem) { if (original_shape.IsTuple()) { std::vector elements; - for (int i = 0; i < original_shape.tuple_shapes_size(); ++i) { + for (int i = 0; i < original_shape.tuple_shapes().size(); ++i) { auto subsharding = sharding ? sharding->tuple_shardings(i) : sharding; TF_ASSIGN_OR_RETURN(auto element, ReshapeWithCorrectRepresentationAndSharding( @@ -131,7 +131,7 @@ absl::StatusOr ReshapeWithCorrectRepresentationAndSharding( hlo_sharding, fast_mem, shape_determination_fns, &to_shape)); } if (xla::ShapeUtil::Compatible(original_shape, to_shape)) { - for (int64_t i = 0; i < original_shape.rank(); ++i) { + for (int64_t i = 0; i < original_shape.dimensions().size(); ++i) { to_shape.set_dynamic_dimension(i, original_shape.is_dynamic_dimension(i)); } } diff --git a/tensorflow/compiler/tf2xla/lib/data_format.cc b/tensorflow/compiler/tf2xla/lib/data_format.cc index 6e00a4153325..2473b97af4c2 100644 --- a/tensorflow/compiler/tf2xla/lib/data_format.cc +++ b/tensorflow/compiler/tf2xla/lib/data_format.cc @@ -39,7 +39,7 @@ absl::StatusOr Contract(xla::XlaOp input, int64_t dim) { // Transpose the input so C is directly followed by VECT_C. std::vector permutation; - auto rank = input_shape.rank(); + const int64_t rank = input_shape.dimensions().size(); permutation.reserve(rank); for (int64_t i = 0; i != rank - 1; ++i) { permutation.push_back(i); diff --git a/tensorflow/compiler/tf2xla/lib/scatter.cc b/tensorflow/compiler/tf2xla/lib/scatter.cc index 606c3d596282..91e357ec69ea 100644 --- a/tensorflow/compiler/tf2xla/lib/scatter.cc +++ b/tensorflow/compiler/tf2xla/lib/scatter.cc @@ -35,9 +35,8 @@ limitations under the License. 
namespace tensorflow { absl::StatusOr XlaScatter( - const xla::XlaOp& buffer, const xla::XlaOp& updates, - const xla::XlaOp& indices, bool indices_are_vectors, - bool indices_are_sorted, + const xla::XlaOp buffer, const xla::XlaOp updates, const xla::XlaOp indices, + bool indices_are_vectors, bool indices_are_sorted, const std::function& combiner, xla::XlaBuilder* builder) { @@ -52,7 +51,7 @@ absl::StatusOr XlaScatter( if (indices_are_vectors) { TF_RET_CHECK(!indices_dims.empty()); num_index_dims = indices_dims.back(); - if (num_index_dims > buffer_shape.rank()) { + if (num_index_dims > buffer_shape.dimensions().size()) { return errors::InvalidArgument( "The size of the minor dimension of the indices (shape: ", xla::ShapeUtil::HumanString(indices_shape), @@ -141,11 +140,11 @@ absl::StatusOr XlaScatter( xla::ScatterDimensionNumbers dim_numbers; dim_numbers.set_index_vector_dim(indices_are_vectors - ? indices_shape.dimensions_size() - 1 - : indices_shape.dimensions_size()); + ? indices_shape.dimensions().size() - 1 + : indices_shape.dimensions().size()); - int64_t updates_rank = updates_shape.rank(); - int64_t buffer_rank = buffer_shape.rank(); + int64_t updates_rank = updates_shape.dimensions().size(); + int64_t buffer_rank = buffer_shape.dimensions().size(); int64_t num_window_dims_in_updates = buffer_rank - num_index_dims; // If the rank of `updates` is 0 and does not match the expected rank of @@ -160,7 +159,7 @@ absl::StatusOr XlaScatter( if (updates_rank == 0 && expected_updates_rank != 0) { new_updates = xla::Broadcast(updates, expected_updates_dims); TF_ASSIGN_OR_RETURN(updates_shape, builder->GetShape(new_updates)); - updates_rank = updates_shape.rank(); + updates_rank = updates_shape.dimensions().size(); } if (updates_rank > 0) { diff --git a/tensorflow/compiler/tf2xla/lib/scatter.h b/tensorflow/compiler/tf2xla/lib/scatter.h index 90af6e63fcbf..1428d173ea13 100644 --- a/tensorflow/compiler/tf2xla/lib/scatter.h +++ b/tensorflow/compiler/tf2xla/lib/scatter.h @@ -45,9 +45,8 @@ namespace tensorflow { // the buffer using the combiner function. Otherwise, the updates replace the // existing values. The order of updates is implementation-defined. absl::StatusOr XlaScatter( - const xla::XlaOp& buffer, const xla::XlaOp& updates, - const xla::XlaOp& indices, bool indices_are_vectors, - bool indices_are_sorted, + xla::XlaOp buffer, xla::XlaOp updates, xla::XlaOp indices, + bool indices_are_vectors, bool indices_are_sorted, const std::function& combiner, xla::XlaBuilder* builder); diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc index 8bae314ff472..d7df0f531001 100644 --- a/tensorflow/compiler/tf2xla/literal_util.cc +++ b/tensorflow/compiler/tf2xla/literal_util.cc @@ -49,7 +49,7 @@ absl::Status HostTensorToBorrowingLiteral(const xla::Shape& xla_shape, xla::BorrowingLiteral* literal) { const auto& tshape = host_tensor.shape(); TF_RET_CHECK(tshape.IsFullyDefined() && - tshape.dims() == xla_shape.dimensions_size() && + tshape.dims() == xla_shape.dimensions().size() && tshape.dim_sizes() == xla_shape.dimensions()) << "Provided xla::Shape must have the same dims as the Tensor shape."; *literal = xla::BorrowingLiteral( diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.h b/tensorflow/compiler/tf2xla/mlir_bridge_pass.h index eae5fb83c5d6..f41c202b01e4 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.h +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.h @@ -19,6 +19,7 @@ limitations under the License. 
#include #include "tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h" +#include "absl/status/status.h" #include "llvm/ADT/StringRef.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "tensorflow/compiler/jit/flags.h" diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc index e65c948c87e4..6a67cfa237af 100644 --- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -1152,8 +1152,7 @@ xla::Shape GetShape(shape_inference::ShapeHandle shape_handle, return xla::Shape( // Type matters only for indices. S64 is the widest possible type. xla::PrimitiveType::S64, dims, - absl::InlinedVector(dynamic_dims.begin(), dynamic_dims.end()), - /*tuple_shapes=*/{}); + absl::InlinedVector(dynamic_dims.begin(), dynamic_dims.end())); } REGISTER_OP("XlaGather") @@ -1211,7 +1210,7 @@ REGISTER_OP("XlaGather") input_shape, start_indices_shape, gather_dim_numbers, slice_sizes)); std::vector dims; - for (int64_t i = 0; i < output_shape.rank(); ++i) { + for (int64_t i = 0; i < output_shape.dimensions().size(); ++i) { if (output_shape.is_unbounded_dynamic_dimension(i)) { dims.push_back(c->UnknownDim()); } else { @@ -1417,6 +1416,7 @@ REGISTER_OP("XlaCallModule") .Attr("function_list: list(func) = []") .Attr("has_token_input_output: bool = false") .Attr("disabled_checks: list(string) = []") + .Attr("use_shardy_partitioner: bool = false") .SetIsStateful() .SetShapeFn([](shape_inference::InferenceContext* c) { std::vector args_shapes; @@ -1492,6 +1492,7 @@ disabled_checks: A list of strings describing the safety checks that were This list, supplemented with a comma-separate list of directives specified using the flag --tf_xla_call_module_disabled_checks, is used at module loading time to skip the corresponding checks. +use_shardy_partitioner: Indicates whether Shardy is used for SPMD partitioning. 
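[Editor's aside on the attribute registered above: use_shardy_partitioner defaults to false, so existing graphs are unaffected. A hypothetical caller-side sketch of opting a single XlaCallModule node in; the NodeDef plumbing around it is assumed, not taken from this patch.]

#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"

// Illustrative only: enable Shardy SPMD partitioning on one node.
void EnableShardy(tensorflow::NodeDef* node) {
  // The attr is a plain bool with a default of false (see the op registration
  // above), so setting it is a single write into the NodeDef attr map.
  (*node->mutable_attr())["use_shardy_partitioner"].set_b(true);
}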
)doc"); } // namespace diff --git a/tensorflow/compiler/tf2xla/shape_util.cc b/tensorflow/compiler/tf2xla/shape_util.cc index b8b56d4eafdc..0d7549d81c20 100644 --- a/tensorflow/compiler/tf2xla/shape_util.cc +++ b/tensorflow/compiler/tf2xla/shape_util.cc @@ -43,7 +43,7 @@ absl::Status PopulateInfeedLayoutVector(const xla::Shape& shape, layouts->push_back(dim); } } else { - layouts->insert(layouts->end(), shape.rank(), -1); + layouts->insert(layouts->end(), shape.dimensions().size(), -1); } return absl::OkStatus(); } @@ -97,7 +97,7 @@ absl::Status XLAShapeToTensorShape(const xla::Shape& shape, " cannot be converted to a TensorShape"); } *tensor_shape = TensorShape(); - for (int i = 0; i < shape.rank(); ++i) { + for (int i = 0; i < shape.dimensions().size(); ++i) { TF_RETURN_IF_ERROR(tensor_shape->AddDimWithStatus(shape.dimensions(i))); } return absl::OkStatus(); @@ -237,7 +237,7 @@ absl::Status GetShapeWithLayout( "Nested tuples not supported: ", xla::ShapeUtil::HumanString(input_shape)); } - int64_t rank = shape.rank(); + int64_t rank = shape.dimensions().size(); if (position + rank > minor_to_major.size()) { return errors::InvalidArgument( "Not enough layout attribute elements: position=", position, @@ -259,7 +259,7 @@ absl::Status GetShapeWithLayout( } *output_shape = xla::ShapeUtil::MakeTupleShape(shapes); } else { - int64_t rank = input_shape.rank(); + int64_t rank = input_shape.dimensions().size(); const int64_t minor_to_major_size = minor_to_major.size(); if (rank != minor_to_major_size) { return errors::InvalidArgument( diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc index 9cc8787d44b6..d61d66bfe53b 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc @@ -136,7 +136,7 @@ TEST(ConvertGraphDefToXla, Sum) { config.mutable_feed(0)->mutable_id()->set_output_index( 123); /* invalid output_index */ - EXPECT_TRUE(errors::IsInvalidArgument( + EXPECT_TRUE(absl::IsInvalidArgument( ConvertGraphDefToXla(graph_def, config, client, &computation))); } diff --git a/tensorflow/compiler/tf2xla/type_util.cc b/tensorflow/compiler/tf2xla/type_util.cc index b2d8a878cc45..ec456344bcfc 100644 --- a/tensorflow/compiler/tf2xla/type_util.cc +++ b/tensorflow/compiler/tf2xla/type_util.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/type_util.h" #include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" #include "xla/xla_data.pb.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" @@ -30,6 +31,9 @@ absl::Status DataTypeToPrimitiveType(DataType data_type, case tensorflow::DT_BOOL: *type = xla::PRED; return absl::OkStatus(); + case tensorflow::DT_INT2: + *type = xla::S2; + return absl::OkStatus(); case tensorflow::DT_INT4: *type = xla::S4; return absl::OkStatus(); @@ -48,6 +52,9 @@ absl::Status DataTypeToPrimitiveType(DataType data_type, case tensorflow::DT_INT64: *type = xla::S64; return absl::OkStatus(); + case tensorflow::DT_UINT2: + *type = xla::U2; + return absl::OkStatus(); case tensorflow::DT_UINT4: *type = xla::U4; return absl::OkStatus(); @@ -120,11 +127,13 @@ absl::StatusOr EncodePrimitiveTypeAsDataType( {xla::F32, DT_FLOAT}, {xla::F64, DT_DOUBLE}, {xla::C64, DT_COMPLEX64}, + {xla::S2, DT_INT2}, {xla::S4, DT_INT4}, {xla::S8, DT_INT8}, {xla::S16, DT_INT16}, {xla::S32, DT_INT32}, {xla::S64, DT_INT64}, + {xla::U2, DT_UINT2}, {xla::U4, DT_UINT4}, {xla::U8, DT_UINT8}, {xla::U16, DT_UINT16}, diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc index 2da560c23635..e84e4b0ba7e3 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc @@ -15,18 +15,35 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" +#include #include #include -#include #include "xla/cpu_function_runtime.h" +#include "tensorflow/core/platform/types.h" namespace tensorflow { +namespace { + +int32 GetResultIndex(const int32* result_index_table, int32 num_results) { + auto it = + std::min_element(result_index_table, result_index_table + num_results); + + if (it == result_index_table + num_results) { + return -1; + } + return *it; +} + +} // namespace + XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data, AllocMode alloc_mode) - : raw_function_(static_data.raw_function_), - result_index_(static_data.result_index_), + : temp_allocation_index_(static_data.temp_allocation_index_), + raw_function_(static_data.raw_function_), + result_index_(GetResultIndex(static_data.result_index_table_, + static_data.num_results_)), buffer_table_(new void*[static_data.num_buffers_]), buffer_infos_(static_data.buffer_infos_), num_buffers_(static_data.num_buffers_), diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h index db280e239f04..da1f668e79dc 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h @@ -17,9 +17,14 @@ limitations under the License. #define TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_H_ #include +#include +#include +#include #include -#include +#include +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" #include "xla/cpu_function_runtime.h" #include "xla/executable_run_options.h" #include "xla/service/cpu/buffer_desc.h" @@ -33,12 +38,20 @@ class ProgramShapeProto; class HloProfilePrinterData; namespace cpu { + +class AotCompiledFunctionLibrary; +class CompilationResultProto; class CpuExecutable; +class NanoRtExecutable; + } // namespace cpu } // namespace xla namespace tensorflow { +// Forward-declare so that it can access StaticData. 
+class XlaCompiledCpuFunctionThunks; + // Represents a function compiled by XLA, produced via either JIT or AOT. // // The Run method invokes the actual computation, with inputs read from arg @@ -77,9 +90,25 @@ class XlaCompiledCpuFunction { // The contents of StaticData are XLA-internal implementation details and // should not be relied on by clients (and therefore are private). class StaticData { + public: + bool has_thunk_sequence() const { + return compilation_result_proto_ != nullptr; + } + private: + // start thunk execution specific + const xla::cpu::CompilationResultProto* compilation_result_proto_ = nullptr; + + absl::flat_hash_map< + std::string, + /*xla::cpu::AotCompiledFunctionLibrary::FunctionPtr*/ void*> + function_library_symbol_map_; + + std::optional temp_allocation_index_ = std::nullopt; + // end thunk execution specific + // The raw function to call. - RawFunction raw_function_; + RawFunction raw_function_ = nullptr; // Contains information about the buffers used by the XLA computation. const xla::cpu_function_runtime::BufferInfo* buffer_infos_ = nullptr; @@ -130,6 +159,7 @@ class XlaCompiledCpuFunction { // Only XlaCompiledCpuFunction is allowed to read and write the above // fields. friend class XlaCompiledCpuFunction; + friend class XlaCompiledCpuFunctionThunks; }; // AllocMode controls the buffer allocation mode. @@ -154,20 +184,22 @@ class XlaCompiledCpuFunction { XlaCompiledCpuFunction& operator=(XlaCompiledCpuFunction&&) = default; // Sets the intra-op thread pool used to run individual ops concurrently. - void set_thread_pool(const Eigen::ThreadPoolDevice* pool) { + virtual void set_thread_pool(const Eigen::ThreadPoolDevice* pool) { run_options_.set_intra_op_thread_pool(pool); } // Runs the computation, with inputs read from arg buffers, and outputs // written to result buffers. Returns true on success and false on failure. - bool Run(); + virtual bool Run(); // Returns the error message from the previous failed Run call. // // TODO(fschneider): For now this always returns an empty string because there // is no support for error reporting in XLA. Remove this once all callers are // updated. - string error_msg() const { return {}; } + string error_msg() const { return error_msg_; } + + void set_error_msg(absl::string_view error_msg) { error_msg_ = error_msg; } // ------------------------------ // Arg methods for managing input buffers. Buffers are in row-major order. @@ -196,6 +228,11 @@ class XlaCompiledCpuFunction { return buffer_infos_[arg_index_table_[idx]].size(); } + int result_size(int idx) const { + assert(idx < num_results()); + return buffer_infos_[result_index_table_[idx]].size(); + } + // Sets the buffer for the positional argument at the given `index` to `data`. // Must be called before Run to have an effect. May be called under any // AllocMode; if the AllocMode is RESULTS_AND_TEMPS_ONLY, this method must be @@ -221,19 +258,6 @@ class XlaCompiledCpuFunction { buffer_table_[arg_index_table_[index]] = const_cast(data); } - // ------------------------------ - // Result methods for managing output buffers. Buffers are in row-major order. - // Must only be called after a successful Run call. Unlike the arg methods, - // there is no set_resultN_data method. The result buffers are managed - // internally, and may change after each call to Run. - - // Returns the underlying array of result buffers, where results()[I] is the - // buffer for the positional result at index I. 
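[Editor's note on the removal that begins above and the replacement visible in the next hunk: the old API exposed a single results() array stored behind result_index_, while the new code resolves each positional result through result_index_table_. A condensed sketch of the new access path, assuming the member names shown in this diff; the free-function form is illustrative.]

#include <cstddef>
#include <cstdint>

// Hypothetical mirror of the new result_data() accessor, not part of the patch.
inline void* ResultData(void** buffer_table, const int32_t* result_index_table,
                        size_t index) {
  // Old path: static_cast<void**>(buffer_table[result_index_])[index]
  // New path: each result buffer is addressed directly via its own slot,
  // looked up through the per-result index table built from buffer infos.
  return buffer_table[result_index_table[index]];
}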
- void** results() { return static_cast(buffer_table_[result_index_]); } - const void* const* results() const { - return static_cast(buffer_table_[result_index_]); - } - // Profile counters for this XLA computation. // // When Hlo profiling is enabled (`hlo_profiling_enabled()` return true in @@ -245,8 +269,12 @@ class XlaCompiledCpuFunction { const int64_t* profile_counters() const { return profile_counters_; } // Returns the buffer for the positional result at the given `index`. - void* result_data(size_t index) { return results()[index]; } - const void* result_data(size_t index) const { return results()[index]; } + void* result_data(size_t index) { + return buffer_table_[result_index_table_[index]]; + } + const void* result_data(size_t index) const { + return buffer_table_[result_index_table_[index]]; + } // ------------------------------ // Methods for extracting optional metadata. @@ -307,6 +335,18 @@ class XlaCompiledCpuFunction { } protected: + virtual bool is_thunk_mode() const { return false; } + + std::optional temp_allocation_index() const { + return temp_allocation_index_; + } + + const xla::cpu_function_runtime::BufferInfo* buffer_infos() const { + return buffer_infos_; + } + + void** buffer_table() const { return buffer_table_; } + // --------------------------------------------------------------------------- // Accessors for reading from and writing to instances of `StaticData`. // @@ -314,6 +354,22 @@ class XlaCompiledCpuFunction { // inherit from `XlaCompiledCpuFunction`. `XlaJitCompiledCpuFunction` can // call these because it is explicitly added as a friend. + static void set_static_data_function_library_symbol_map( + StaticData* static_data, + absl::flat_hash_map< + std::string, + /*xla::cpu::AotCompiledFunctionLibrary::FunctionPtr*/ void*> + function_library_symbol_map) { + static_data->function_library_symbol_map_ = + std::move(function_library_symbol_map); + } + + static void set_static_data_compilation_result_proto( + StaticData* static_data, + const xla::cpu::CompilationResultProto* compilation_result_proto) { + static_data->compilation_result_proto_ = compilation_result_proto; + } + static void set_static_data_raw_function(StaticData* static_data, RawFunction raw_function) { static_data->raw_function_ = raw_function; @@ -355,6 +411,12 @@ class XlaCompiledCpuFunction { static_data->num_variables_ = num_variables; } + static void set_static_data_temp_allocation_index( + StaticData* static_data, + const std::optional temp_allocation_index) { + static_data->temp_allocation_index_ = temp_allocation_index; + } + static void set_static_data_result_index(StaticData* static_data, size_t result_index) { static_data->result_index_ = result_index; @@ -411,8 +473,9 @@ class XlaCompiledCpuFunction { static void set_static_data_use_xla_runtime(StaticData* static_data, bool) {} private: - const RawFunction raw_function_; + const std::optional temp_allocation_index_; + const RawFunction raw_function_ = nullptr; const size_t result_index_; // Array containing pointers to argument and temp buffers (slots corresponding @@ -460,6 +523,8 @@ class XlaCompiledCpuFunction { const xla::ProgramShapeProto* program_shape_ = nullptr; const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr; + std::string error_msg_ = ""; + // Add `XlaJitCompiledCpuFunction` as a friend so that it can access the // `set_static_data_*` static methods above. 
friend class XlaJitCompiledCpuFunction; diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_factory.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_factory.cc new file mode 100644 index 000000000000..2f526e1efd96 --- /dev/null +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_factory.cc @@ -0,0 +1,49 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function_factory.h" + +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" + +namespace tensorflow { +namespace xla_compiled_cpu_function_factory { + +// Weak symbol to allow for the thunk factory to be registered by the +// xla_compiled_cpu_function_thunk_factory_registerer. This is a workaround that +// allows us to link in the thunk runtime without breaking AOT size constraints. +std::unique_ptr CreateXlaCompiledCpuFunctionThunks( + const XlaCompiledCpuFunction::StaticData& static_data, + XlaCompiledCpuFunction::AllocMode alloc_mode) __attribute__((weak)); + +absl::StatusOr> Create( + const XlaCompiledCpuFunction::StaticData& static_data, + XlaCompiledCpuFunction::AllocMode alloc_mode) { + if (static_data.has_thunk_sequence()) { + if (CreateXlaCompiledCpuFunctionThunks == nullptr) { + return absl::InternalError( + "XlaCompiledCpuFunctionThunks factory is not registered"); + } + return CreateXlaCompiledCpuFunctionThunks(static_data, alloc_mode); + } else { + return std::make_unique(static_data, alloc_mode); + } +} + +} // namespace xla_compiled_cpu_function_factory +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_factory.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_factory.h new file mode 100644 index 000000000000..099c8f05fc8d --- /dev/null +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_factory.h @@ -0,0 +1,38 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
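[Editor's note on the factory introduced above: Create() returns the thunk-based implementation when a compilation result proto is present and the weak-symbol hook is linked in, and falls back to the classic XlaCompiledCpuFunction otherwise. A caller-side usage sketch, assuming only the interfaces shown in this patch; the surrounding error handling is illustrative.]

#include <memory>

#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h"
#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function_factory.h"

// Illustrative usage, not part of the patch.
absl::Status RunOnce(
    const tensorflow::XlaCompiledCpuFunction::StaticData& static_data) {
  absl::StatusOr<std::unique_ptr<tensorflow::XlaCompiledCpuFunction>> function =
      tensorflow::xla_compiled_cpu_function_factory::Create(static_data);
  if (!function.ok()) return function.status();
  // Argument buffers are assumed to be populated already, e.g. via the arg
  // setters declared earlier in xla_compiled_cpu_function.h.
  if (!(*function)->Run()) {
    return absl::InternalError((*function)->error_msg());
  }
  return absl::OkStatus();
}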
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_FACTORY_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_FACTORY_H_ + +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" + +namespace tensorflow { +namespace xla_compiled_cpu_function_factory { + +// A utility function to create an XlaCompiledCpuFunction. +absl::StatusOr> Create( + const XlaCompiledCpuFunction::StaticData& static_data, + XlaCompiledCpuFunction::AllocMode alloc_mode = XlaCompiledCpuFunction:: + AllocMode::ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS); + +} // namespace xla_compiled_cpu_function_factory +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_FACTORY_H_ diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunk_factory_registerer.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunk_factory_registerer.cc new file mode 100644 index 000000000000..4301c663765c --- /dev/null +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunk_factory_registerer.cc @@ -0,0 +1,32 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" +#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.h" + +namespace tensorflow { +namespace xla_compiled_cpu_function_factory { + +std::unique_ptr CreateXlaCompiledCpuFunctionThunks( + const XlaCompiledCpuFunction::StaticData& static_data, + XlaCompiledCpuFunction::AllocMode alloc_mode) { + return std::make_unique(static_data, + alloc_mode); +} + +} // namespace xla_compiled_cpu_function_factory +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc new file mode 100644 index 000000000000..8c1d22fe65b5 --- /dev/null +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc @@ -0,0 +1,130 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.h" + +#include +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/status/status.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" +#include "xla/backends/cpu/codegen/aot_compiled_function_library.h" +#include "xla/backends/cpu/nanort/nanort_executable.h" +#include "xla/backends/cpu/runtime/function_library.h" +#include "xla/service/cpu/cpu_aot_compilation_result.h" +#include "xla/tsl/concurrency/async_value_ref.h" +#include "xla/tsl/platform/status.h" + +namespace tensorflow { + +XlaCompiledCpuFunctionThunks::XlaCompiledCpuFunctionThunks( + const StaticData& static_data, AllocMode alloc_mode) + : XlaCompiledCpuFunction(static_data, alloc_mode) { + CHECK(static_data.compilation_result_proto_ != nullptr); + + std::unique_ptr function_library = + std::make_unique( + static_data.function_library_symbol_map_); + + auto aot_compilation_result = + xla::cpu::CpuAotCompilationResultThunks::FromString( + static_data.compilation_result_proto_->SerializeAsString(), + function_library.release()); + + // To load a CPU executable we don't need a compiler or a stream executor. + TF_CHECK_OK(aot_compilation_result.status()); + // NO_CDC: aot_compilation_result is checked to be OK above. + auto cpu_executable = std::move(*aot_compilation_result.value()) + .LoadExecutable(nullptr, nullptr); + + TF_CHECK_OK(cpu_executable.status()); + auto executable_or_err = + // NO_CDC: cpu_executable is checked to be OK above. + xla::cpu::NanoRtExecutable::Create(std::move(cpu_executable.value())); + + TF_CHECK_OK(executable_or_err.status()); + // NO_CDC: executable_or_err is checked to be OK above. 
+ executable_ = std::move(executable_or_err.value()); +} + +bool XlaCompiledCpuFunctionThunks::Run() { + auto ret = Execute(GenerateNanortArgs(), GenerateNanortResults(), + GenerateNanortPreallocatedTemp()); + + if (!ret.ok()) { + set_error_msg(ret.message()); + } + + return ret.ok(); +} + +std::vector +XlaCompiledCpuFunctionThunks::GenerateNanortArgs() { + std::vector arguments; + arguments.reserve(num_args()); + for (int i = 0; i < num_args(); ++i) { + arguments.push_back( + xla::cpu::NanoRtExecutable::Argument(arg_data(i), arg_size(i))); + } + + return arguments; +} + +std::vector +XlaCompiledCpuFunctionThunks::GenerateNanortResults() { + std::vector results; + results.reserve(num_results()); + for (int i = 0; i < num_results(); ++i) { + results.push_back( + xla::cpu::NanoRtExecutable::Result(result_data(i), result_size(i))); + } + + return results; +} + +xla::cpu::NanoRtExecutable::PreallocatedTemp +XlaCompiledCpuFunctionThunks::GenerateNanortPreallocatedTemp() { + xla::cpu::NanoRtExecutable::PreallocatedTemp temp; + + auto temp_allocation_index = this->temp_allocation_index(); + if (temp_allocation_index.has_value()) { + temp = xla::cpu::NanoRtExecutable::PreallocatedTemp( + static_cast(buffer_table()[*temp_allocation_index]), + buffer_infos()[*temp_allocation_index].size()); + } + + return temp; +} + +absl::Status XlaCompiledCpuFunctionThunks::Execute( + absl::Span arguments, + absl::Span results, + xla::cpu::NanoRtExecutable::PreallocatedTemp temp) { + auto event = + executable_->Execute(arguments, results, temp, thunk_run_options_); + tsl::BlockUntilReady(event); + + if (!event.IsConcrete()) { + return event.GetError(); + } + + return absl::OkStatus(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.h new file mode 100644 index 000000000000..efe533106b7c --- /dev/null +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.h @@ -0,0 +1,66 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_THUNKS_H_ +#define TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_THUNKS_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" +#include "xla/backends/cpu/nanort/nanort_executable.h" +#include "xla/executable_run_options.h" +#include "xla/service/cpu/executable.pb.h" +#include "xla/tsl/platform/threadpool.h" + +namespace tensorflow { + +class XlaCompiledCpuFunctionThunks : public XlaCompiledCpuFunction { + public: + explicit XlaCompiledCpuFunctionThunks( + const StaticData& static_data, + AllocMode alloc_mode = + AllocMode::ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS); + + bool Run() override; + + bool is_thunk_mode() const override { return true; } + + void set_thread_pool(const Eigen::ThreadPoolDevice* pool) override { + thunk_run_options_.set_intra_op_thread_pool(pool); + } + + protected: + std::vector GenerateNanortArgs(); + std::vector GenerateNanortResults(); + xla::cpu::NanoRtExecutable::PreallocatedTemp GenerateNanortPreallocatedTemp(); + + private: + // For NanoRtExecutable. + absl::Status Execute( + absl::Span arguments, + absl::Span results, + xla::cpu::NanoRtExecutable::PreallocatedTemp temp); + + std::unique_ptr executable_; + xla::cpu::NanoRtExecutable::ExecuteOptions thunk_run_options_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_THUNKS_H_ diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index c51107fb9dea..4df8870022a0 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -935,7 +935,7 @@ absl::Status XlaCompiler::XLAShapeForArgument( if (std::holds_alternative(arg.shape) && std::get(arg.shape).is_dynamic()) { xla::Shape dynamic_shape = std::get(arg.shape); - for (int i = 0; i < xla_shape->dimensions_size(); ++i) { + for (int i = 0; i < xla_shape->dimensions().size(); ++i) { xla_shape->set_dynamic_dimension( i, dynamic_shape.is_dynamic_dimension(i)); } @@ -1678,7 +1678,8 @@ absl::Status XlaCompiler::SetDeviceToHostMetadata( tf2xla::HostTransferMetadata& existing_transfer = host_compute_sends_[key]; tf2xla::HostTransferMetadata new_transfer; SetTransfer(key, types, shapes, &new_transfer); - if (xla::protobuf_util::ProtobufEquals(existing_transfer, new_transfer)) { + if (xla::protobuf_util::HaveSameSerialization(existing_transfer, + new_transfer)) { return absl::OkStatus(); } else { return errors::InvalidArgument( @@ -1712,7 +1713,8 @@ absl::Status XlaCompiler::SetHostToDeviceMetadata( tf2xla::HostTransferMetadata& existing_transfer = host_compute_recvs_[key]; tf2xla::HostTransferMetadata new_transfer; SetTransfer(key, types, shapes, &new_transfer); - if (xla::protobuf_util::ProtobufEquals(existing_transfer, new_transfer)) { + if (xla::protobuf_util::HaveSameSerialization(existing_transfer, + new_transfer)) { return absl::OkStatus(); } else { return errors::InvalidArgument( diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc index ac8586148b66..a9542714efdf 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc @@ -15,20 +15,25 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h" +#include #include +#include #include #include +#include "absl/status/status.h" #include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/tf2xla.h" #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" +#include "xla/backends/cpu/codegen/compiled_function_library.h" #include "xla/client/client_library.h" #include "xla/client/executable_build_options.h" #include "xla/client/local_client.h" #include "xla/cpu_function_runtime.h" #include "xla/hlo/builder/xla_computation.h" #include "xla/service/cpu/buffer_info_util.h" +#include "xla/service/cpu/cpu_aot_compilation_result.h" #include "xla/service/cpu/cpu_executable.h" #include "xla/service/platform_util.h" #include "xla/shape_util.h" @@ -39,6 +44,7 @@ limitations under the License. #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" +#include "tsl/platform/casts.h" #include "tsl/platform/statusor.h" namespace tensorflow { @@ -131,22 +137,15 @@ XlaJitCompiledCpuFunction::Compile( arg_shapes.push_back(&program_shape->parameters(i)); } - // TODO(b/342515164): Implement XLA jit compiled functions + thunks. - xla::ExecutableBuildOptions build_options_copy = build_options; - build_options_copy.mutable_debug_options()->set_xla_cpu_use_thunk_runtime( - false); - // Compile the executable. The static_cast to the CpuExecutable subclass is // necessary since the raw function and buffer assignments are only available // there. - TF_ASSIGN_OR_RETURN(auto executables, client->Compile(computation, arg_shapes, - build_options_copy)); + TF_ASSIGN_OR_RETURN(auto executables, + client->Compile(computation, arg_shapes, build_options)); TF_RET_CHECK(executables.size() == 1); std::unique_ptr executable = std::move(executables[0]); - const xla::cpu::CpuExecutable* cpu_executable = + xla::cpu::CpuExecutable* cpu_executable = static_cast(executable->executable()); - XlaCompiledCpuFunction::RawFunction raw_function = - cpu_executable->compute_function(); const xla::BufferAssignment& buffer_assignment = cpu_executable->buffer_assignment(); @@ -156,26 +155,82 @@ XlaJitCompiledCpuFunction::Compile( buffer_assignment); std::vector arg_index_table = xla::cpu::CreateArgIndexTableFromBufferInfos(buffer_infos); + std::vector result_index_table = + xla::cpu::CreateResultIndexTableFromBufferInfos(buffer_infos); TF_ASSIGN_OR_RETURN(size_t result_index, ComputeResultIndex(buffer_assignment)); const int num_results = CountResults(buffer_infos); std::unique_ptr jit_unique_ptr( new XlaJitCompiledCpuFunction); + XlaJitCompiledCpuFunction* jit = jit_unique_ptr.get(); + + if (!cpu_executable->has_thunks()) { + return absl::InternalError( + "JIT compilation supports only thunk execution."); + } + + { + // This is here for simplicity, effectively just used to get the thunk + // information to the XlaCompiledCpuFunction. + TF_ASSIGN_OR_RETURN( + auto compilation_result, + xla::cpu::CpuAotCompilationResultThunks::Create( + &cpu_executable->module(), &cpu_executable->buffer_assignment(), + cpu_executable->module_name(), + // Symbols and object files are not needed since the function + // library will be backed by the one in the executable which is + // owned by XlaJitCompiledCpuFunction. 
+ /*obj_files=*/{}, /*symbols=*/{}, + cpu_executable->thunks().thunk_sequence(), + cpu_executable->function_library(), + /*hlo_profile_printer_data=*/nullptr)); + + const std::optional temp_allocation_index = + compilation_result->temp_allocation_index(); + + XlaCompiledCpuFunction::set_static_data_temp_allocation_index( + &jit->static_data_, temp_allocation_index); + + jit->compilation_result_proto_ = + std::make_unique( + compilation_result->proto()); + + auto compiled_function_library = + tsl::down_cast( + cpu_executable->function_library()); + + if (!compiled_function_library) { + return absl::InternalError( + "Could not downcast FunctionLibrary to CompiledFunctionLibrary"); + } + + // NOTE: This will work because the function library is by the + // executable and keeps the function pointers alive. + jit->function_library_symbol_map_ = + compiled_function_library->GetTypelessSymbolsMap(); + } + jit->executable_ = std::move(executable); jit->buffer_infos_ = std::move(buffer_infos); jit->arg_index_table_ = std::move(arg_index_table); + jit->result_index_table_ = std::move(result_index_table); jit->program_shape_ = std::make_unique(program_shape->ToProto()); - XlaCompiledCpuFunction::set_static_data_raw_function(&jit->static_data_, - raw_function); + XlaCompiledCpuFunction::set_static_data_compilation_result_proto( + &jit->static_data_, jit->compilation_result_proto_.get()); + XlaCompiledCpuFunction::set_static_data_function_library_symbol_map( + &jit->static_data_, jit->function_library_symbol_map_); + XlaCompiledCpuFunction::set_static_data_buffer_infos( &jit->static_data_, jit->buffer_infos_.data()); XlaCompiledCpuFunction::set_static_data_num_buffers( &jit->static_data_, jit->buffer_infos_.size()); XlaCompiledCpuFunction::set_static_data_arg_index_table( &jit->static_data_, jit->arg_index_table_.data()); + XlaCompiledCpuFunction::set_static_data_result_index_table( + &jit->static_data_, jit->result_index_table_.data()); XlaCompiledCpuFunction::set_static_data_num_args( &jit->static_data_, jit->arg_index_table_.size()); XlaCompiledCpuFunction::set_static_data_num_variables(&jit->static_data_, diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h index c3982bb5307e..8d142ffbe325 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h @@ -17,15 +17,17 @@ limitations under the License. #define TENSORFLOW_COMPILER_TF2XLA_XLA_JIT_COMPILED_CPU_FUNCTION_H_ #include +#include #include +#include "absl/container/flat_hash_map.h" #include "absl/log/check.h" #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" -#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" +#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.h" #include "xla/client/local_client.h" #include "xla/cpu_function_runtime.h" +#include "xla/service/cpu/executable.pb.h" #include "tensorflow/core/framework/graph.pb.h" -#include "tensorflow/core/platform/statusor.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { @@ -65,11 +67,17 @@ class XlaJitCompiledCpuFunction { } private: - XlaJitCompiledCpuFunction() {} + XlaJitCompiledCpuFunction() : compilation_result_proto_(nullptr) {} // The executable holds the underlying function. std::unique_ptr executable_; + // The compilation result proto. 
+ std::unique_ptr compilation_result_proto_; + + // Function library symbol map used to construct AotCompiledFunctionLibrary + absl::flat_hash_map function_library_symbol_map_; + // The static data is backed by the rest of the state in this class. XlaCompiledCpuFunction::StaticData static_data_; @@ -79,6 +87,9 @@ class XlaJitCompiledCpuFunction { // The backing array for the arg index table. std::vector arg_index_table_; + // The backing array for the result index table. + std::vector result_index_table_; + // The backing arrays of arg and result names. We hold the actual strings in // nonempty_*_names_, and hold arrays of pointers in *_names_ for the static // data to refer to. diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc index 3c91d462fc2e..acac1efd7388 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc @@ -19,18 +19,15 @@ limitations under the License. #include #include "absl/log/check.h" -#include "absl/memory/memory.h" #include "absl/status/statusor.h" #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" -#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" +#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.h" #include "xla/client/executable_build_options.h" -#include "xla/client/local_client.h" #include "xla/hlo/testlib/test.h" #include "xla/service/compiler.h" #include "xla/service/platform_util.h" #include "xla/shape.h" #include "xla/shape_util.h" -#include "xla/status_macros.h" #include "xla/stream_executor/platform.h" #include "xla/stream_executor/platform_manager.h" #include "xla/tsl/lib/core/status_test_util.h" @@ -39,11 +36,8 @@ limitations under the License. 
#include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/node_def.pb.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/statusor.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" -#include "tsl/platform/statusor.h" namespace tensorflow { namespace { @@ -175,21 +169,6 @@ tf2xla::Config SumConfigVariable() { return config; } -TEST(XlaJitCompiledCpuFunction, CheckThunkDisabled) { - GraphDef graph_def = SumGraph(); - tf2xla::Config config = SumConfig(); - - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr jit, - XlaJitCompiledCpuFunction::Compile(graph_def, config, - xla::ExecutableBuildOptions())); - ASSERT_TRUE(jit->LocalExecutable().build_options().has_debug_options()); - ASSERT_FALSE(jit->LocalExecutable() - .build_options() - .debug_options() - .xla_cpu_use_thunk_runtime()); -} - TEST(XlaJitCompiledCpuFunction, Sum) { GraphDef graph_def = SumGraph(); tf2xla::Config config = SumConfig(); @@ -198,7 +177,7 @@ TEST(XlaJitCompiledCpuFunction, Sum) { std::unique_ptr jit, XlaJitCompiledCpuFunction::Compile(graph_def, config, xla::ExecutableBuildOptions())); - XlaCompiledCpuFunction function(jit->StaticData()); + XlaCompiledCpuFunctionThunks function(jit->StaticData()); ASSERT_EQ(function.num_args(), 2); ASSERT_EQ(function.num_results(), 1); @@ -262,7 +241,9 @@ TEST(XlaJitCompiledCpuFunction, Sum) { using xla::ShapeUtil; const xla::Shape s32 = ShapeUtil::MakeShape(xla::S32, {}); ASSERT_TRUE(function.ProgramShape() != nullptr); - const xla::ProgramShape program_shape(*function.ProgramShape()); + TF_ASSERT_OK_AND_ASSIGN( + xla::ProgramShape program_shape, + xla::ProgramShape::FromProto(*function.ProgramShape())); ASSERT_EQ(program_shape.parameters_size(), 2); EXPECT_TRUE(ShapeUtil::Compatible(program_shape.parameters(0), s32)); EXPECT_TRUE(ShapeUtil::Compatible(program_shape.parameters(1), s32)); @@ -282,7 +263,7 @@ TEST(XlaJitCompiledCpuFunction, SumVariable) { std::unique_ptr jit, XlaJitCompiledCpuFunction::Compile(graph_def, config, xla::ExecutableBuildOptions())); - XlaCompiledCpuFunction function(jit->StaticData()); + XlaCompiledCpuFunctionThunks function(jit->StaticData()); ASSERT_EQ(function.num_args(), 2); ASSERT_EQ(function.num_results(), 2); @@ -320,7 +301,9 @@ TEST(XlaJitCompiledCpuFunction, SumVariable) { const xla::Shape s32 = ShapeUtil::MakeShape(xla::S32, {}); const xla::Shape s32_1 = ShapeUtil::MakeShape(xla::S32, {1}); ASSERT_TRUE(function.ProgramShape() != nullptr); - const xla::ProgramShape program_shape(*function.ProgramShape()); + TF_ASSERT_OK_AND_ASSIGN( + xla::ProgramShape program_shape, + xla::ProgramShape::FromProto(*function.ProgramShape())); ASSERT_EQ(program_shape.parameters_size(), 2); EXPECT_TRUE(ShapeUtil::Compatible(program_shape.parameters(0), s32)); EXPECT_TRUE(ShapeUtil::Compatible(program_shape.parameters(1), s32_1)); diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index a17ccd63d14f..e999c23fffae 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -203,7 +203,7 @@ absl::Status XlaOpKernelContext::ConstantInputReshaped( // Converts an int16, int32 or int64 scalar literal to an int64. 
static absl::Status LiteralToInt64Scalar(const xla::LiteralSlice& literal, int64_t* out) { - if (literal.shape().rank() != 0) { + if (!literal.shape().dimensions().empty()) { return errors::InvalidArgument("value is not a scalar"); } if (literal.shape().element_type() == xla::S16) { @@ -221,7 +221,7 @@ static absl::Status LiteralToInt64Scalar(const xla::LiteralSlice& literal, // Converts an float32 or float64 scalar literal to a float64. static absl::Status LiteralToFloat64Scalar(const xla::LiteralSlice& literal, double* out) { - if (literal.shape().rank() != 0) { + if (!literal.shape().dimensions().empty()) { return errors::InvalidArgument("value is not a scalar"); } if (literal.shape().element_type() == xla::F32) { @@ -263,7 +263,7 @@ absl::Status XlaOpKernelContext::ConstantInputAsFloatScalar( static absl::Status LiteralToPredVector(const xla::LiteralSlice& literal, std::vector* out) { - if (literal.shape().rank() != 1) { + if (literal.shape().dimensions().size() != 1) { return errors::InvalidArgument("output_shape must be rank 1, got shape ", literal.shape().DebugString()); } @@ -363,7 +363,7 @@ absl::Status XlaOpKernelContext::ResolveInputDynamismIntoPredVector( // Converts an int32 or int64 1D literal to an int64 vector. static absl::Status LiteralToInt64Vector(const xla::LiteralSlice& literal, std::vector* out) { - if (literal.shape().rank() != 1) { + if (literal.shape().dimensions().size() != 1) { return errors::InvalidArgument("output_shape must be rank 1, got shape ", literal.shape().DebugString()); } @@ -472,7 +472,7 @@ absl::Status XlaOpKernelContext::ConstantInputAsPartialShape( xla::Literal literal; TF_RETURN_IF_ERROR(ConstantInput(index, &literal)); // If `literal` is a scalar it's value must be -1. - if (literal.shape().rank() == 0) { + if (literal.shape().dimensions().empty()) { int64_t shape_val; TF_RETURN_IF_ERROR(LiteralToInt64Scalar(literal, &shape_val)); if (shape_val != -1) { diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 1b62cc5770e2..2f0ff5e91867 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -179,7 +179,6 @@ tf_proto_library( "//tensorflow/core/example:protos_all", "//tensorflow/core/framework:protos_all", "//tensorflow/core/lib/core:error_codes_proto", - "//tensorflow/core/profiler/protobuf:xplane_proto", "//tensorflow/core/profiler:profiler_options_proto", "//tensorflow/core/protobuf:error_codes_proto_impl", "//tensorflow/core/protobuf:for_core_protos", @@ -469,7 +468,6 @@ cc_library( hdrs = ["//tensorflow/core/public:session_options.h"], visibility = ["//visibility:public"], deps = [ - ":lib", ":protos_all_cc", ], ) @@ -529,6 +527,9 @@ cc_library( "//tensorflow/dtensor/cc:dtensor_ops", ] + select({ # Non-tpu platforms don't need tpu dependency. 
+ # copybara:uncomment_begin(google-only) + # "//buildenv/platforms/settings:chrome_linux": [], + # copybara:uncomment_end "//tensorflow:chromiumos": [], "//tensorflow:fuchsia": [], "//conditions:default": [ @@ -1014,6 +1015,9 @@ cc_library( hdrs = if_mobile(["//tensorflow/core/config:flags_headers_filegroup"]), copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_lite_protos() + if_ios(["-Os"]), defines = ["SUPPORT_SELECTIVE_REGISTRATION"] + select({ + # copybara:uncomment_begin(google-only) + # "//buildenv/platforms/settings:chrome_linux": ["IS_MOBILE_PLATFORM"], + # copybara:uncomment_end "//tensorflow:chromiumos": ["IS_MOBILE_PLATFORM"], "//tensorflow:fuchsia": ["IS_MOBILE_PLATFORM"], "//conditions:default": [], @@ -1031,7 +1035,9 @@ cc_library( "//tensorflow/core:mobile_additional_lib_deps", "//tensorflow/core/platform:resource", "//tensorflow/core/public:release_version", + "//tensorflow/core/util:onednn_env_vars", "//tensorflow/core/util:stats_calculator_portable", + "@local_xla//xla/tsl/util:safe_reinterpret_cast", ] + tf_portable_proto_lib() + tf_portable_deps_no_runtime(), alwayslink = 1, ) @@ -1046,6 +1052,7 @@ cc_library( # "EIGEN_NEON_GEBP_NR=4", # ] + select({ # "//tensorflow:chromiumos": ["IS_MOBILE_PLATFORM"], +# "//buildenv/platforms/settings:chrome_linux": ["IS_MOBILE_PLATFORM"], # "//tensorflow:fuchsia": ["IS_MOBILE_PLATFORM"], # "//conditions:default": [], # }) + tf_defines_nortti_if_lite_protos() + select({ @@ -1067,6 +1074,7 @@ cc_library( # "@com_google_absl//absl/strings", # "@com_google_absl//absl/types:optional", # "@local_xla//xla/tsl/framework/fixedpoint", +# "@local_xla//xla/tsl/util:safe_reinterpret_cast", # "//tensorflow/core/platform:resource", # "//tensorflow/core/util:managed_stack_trace", # "//tensorflow/core/util:stats_calculator_portable", @@ -1473,7 +1481,6 @@ cc_library( "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc_impl", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc_impl", "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_cc_impl", - "//tensorflow/core/profiler/protobuf:xplane_proto_cc_impl", "//tensorflow/core/protobuf:autotuning_proto_cc_impl", "//tensorflow/core/protobuf:conv_autotuning_proto_cc_impl", ":protos_all_cc_impl", @@ -1823,6 +1830,12 @@ tf_cuda_library( "//tensorflow/core/public:session.h", ], copts = tf_copts(), + visibility = [ + ":dependency_allowlist", + "//learning/gemini/gemax/core/models/gemini3/vision/vision_decoder:__pkg__", + "//tensorflow:internal", + "//tensorflow_models:__subpackages__", + ], deps = ["//tensorflow/core/common_runtime:core_cpu_base_no_ops"] + if_static([ ":function_ops_op_lib", ":functional_grad", @@ -2035,6 +2048,7 @@ filegroup( "//tensorflow/core/lib/gif/testdata:gif_testdata", # BMP data "//tensorflow/core/lib/bmp:bmp_testdata", + "//tensorflow/core/lib/webp:testdata", ], visibility = ["//visibility:public"], ) diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeImage.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeImage.pbtxt index 7174c8d3dafe..3e1f81cc9596 100644 --- a/tensorflow/core/api_def/base_api/api_def_DecodeImage.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DecodeImage.pbtxt @@ -28,25 +28,27 @@ END attr { name: "expand_animations" description: <